@agentuity/evals 0.0.103
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +244 -0
- package/README.md +19 -0
- package/dist/_utils.d.ts +63 -0
- package/dist/_utils.d.ts.map +1 -0
- package/dist/_utils.js +102 -0
- package/dist/_utils.js.map +1 -0
- package/dist/adversarial.d.ts +18 -0
- package/dist/adversarial.d.ts.map +1 -0
- package/dist/adversarial.js +73 -0
- package/dist/adversarial.js.map +1 -0
- package/dist/ambiguity.d.ts +24 -0
- package/dist/ambiguity.d.ts.map +1 -0
- package/dist/ambiguity.js +77 -0
- package/dist/ambiguity.js.map +1 -0
- package/dist/answer-completeness.d.ts +24 -0
- package/dist/answer-completeness.d.ts.map +1 -0
- package/dist/answer-completeness.js +79 -0
- package/dist/answer-completeness.js.map +1 -0
- package/dist/conciseness.d.ts +24 -0
- package/dist/conciseness.d.ts.map +1 -0
- package/dist/conciseness.js +78 -0
- package/dist/conciseness.js.map +1 -0
- package/dist/extraneous-content.d.ts +24 -0
- package/dist/extraneous-content.d.ts.map +1 -0
- package/dist/extraneous-content.js +81 -0
- package/dist/extraneous-content.js.map +1 -0
- package/dist/format.d.ts +18 -0
- package/dist/format.d.ts.map +1 -0
- package/dist/format.js +71 -0
- package/dist/format.js.map +1 -0
- package/dist/index.d.ts +15 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +15 -0
- package/dist/index.js.map +1 -0
- package/dist/knowledge-retention.d.ts +24 -0
- package/dist/knowledge-retention.d.ts.map +1 -0
- package/dist/knowledge-retention.js +83 -0
- package/dist/knowledge-retention.js.map +1 -0
- package/dist/pii.d.ts +18 -0
- package/dist/pii.d.ts.map +1 -0
- package/dist/pii.js +68 -0
- package/dist/pii.js.map +1 -0
- package/dist/politeness.d.ts +24 -0
- package/dist/politeness.d.ts.map +1 -0
- package/dist/politeness.js +69 -0
- package/dist/politeness.js.map +1 -0
- package/dist/role-adherence.d.ts +24 -0
- package/dist/role-adherence.d.ts.map +1 -0
- package/dist/role-adherence.js +84 -0
- package/dist/role-adherence.js.map +1 -0
- package/dist/safety.d.ts +18 -0
- package/dist/safety.d.ts.map +1 -0
- package/dist/safety.js +73 -0
- package/dist/safety.js.map +1 -0
- package/dist/self-reference.d.ts +18 -0
- package/dist/self-reference.d.ts.map +1 -0
- package/dist/self-reference.js +63 -0
- package/dist/self-reference.js.map +1 -0
- package/dist/types.d.ts +18 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/types.js +2 -0
- package/dist/types.js.map +1 -0
- package/package.json +45 -0
- package/src/_utils.ts +189 -0
- package/src/adversarial.ts +83 -0
- package/src/ambiguity.ts +96 -0
- package/src/answer-completeness.ts +97 -0
- package/src/conciseness.ts +96 -0
- package/src/extraneous-content.ts +99 -0
- package/src/format.ts +81 -0
- package/src/index.ts +23 -0
- package/src/knowledge-retention.ts +101 -0
- package/src/pii.ts +78 -0
- package/src/politeness.ts +87 -0
- package/src/role-adherence.ts +102 -0
- package/src/safety.ts +83 -0
- package/src/self-reference.ts +75 -0
- package/src/types.ts +19 -0
package/dist/answer-completeness.d.ts
ADDED
@@ -0,0 +1,24 @@
import type { BaseEvalOptions } from './types';
export declare const answerCompletenessPrompt = "You are evaluating whether an LLM response directly addresses the user's request.\n\n## Inputs\n\n- USER REQUEST: {{USER_REQUEST}}\n- MODEL RESPONSE: {{MODEL_RESPONSE}}\n\n## Your task\n\n1. Assume a strict auditor who expects every part of the user's request to be addressed.\n2. Identify each distinct question, instruction, or requirement in the USER REQUEST.\n3. For each identified item, check whether the MODEL RESPONSE provides a direct, relevant answer or fulfillment.\n4. Flag any of the following violations:\n - Ignoring part of the request entirely\n - Providing tangential information instead of answering\n - Deflecting with \"I can't help with that\" without justification\n - Answering a different question than what was asked\n - Providing vague or generic responses that do not address specifics\n - Excessive hedging or caveats that obscure whether the question was answered\n\n## Score\n\n- Start from 1.0.\n- Subtract points for each violation:\n - Minor omission (small detail or sub-question not addressed): \u22120.2\n - Partial answer (core question addressed but incompletely): \u22120.3\n - Tangential response (related content but does not answer the question): \u22120.5\n - Complete miss (major requirement or question ignored entirely): \u22120.6\n - Refusal without valid justification: \u22120.8\n- Minimum score is 0.0.\n- Multiple violations compound independently.\n\n## Pass/Fail\n\n- passed = true only if score \u2265 0.7 AND no complete misses or unjustified refusals are present.\n\n## Constraints\n\n- Do not credit the response for being correct if it does not address what was asked.\n- Do not credit the response for being helpful on unrelated topics.\n- Do not infer that the user's needs were met unless explicitly addressed in the response.\n- Do not excuse incomplete answers due to response length or complexity.\n\n## Output format (STRICT JSON, one line reason):\n\n{\n \"score\": <number between 0.0 and 1.0>,\n \"passed\": <true|false>,\n \"metadata\": {\n \"reason\": \"<single concise sentence listing which parts of the request were or were not addressed>\"\n }\n}";
type AnswerCompletenessEvalOptions = BaseEvalOptions & {
    threshold: number;
};
export declare const answerCompleteness: <TAgentInput extends import("@agentuity/core").StandardSchemaV1 | undefined = any, TAgentOutput extends import("@agentuity/core").StandardSchemaV1 | undefined = any>(overrides?: (Partial<{
    name?: string;
    description?: string;
} & BaseEvalOptions & {
    threshold: number;
}> & {
    middleware?: import("./types").EvalMiddleware<TAgentInput extends import("@agentuity/core").StandardSchemaV1<unknown, unknown> ? import("@agentuity/core").InferOutput<TAgentInput> : any, TAgentOutput extends import("@agentuity/core").StandardSchemaV1<unknown, unknown> ? import("@agentuity/core").InferOutput<TAgentOutput> : any, {
        request: string;
    } & {
        context?: string | undefined;
    }, {
        response: string;
    } & {}> | undefined;
}) | undefined) => import("@agentuity/runtime").CreateEvalConfig<any, any> & {
    name: string;
    options: AnswerCompletenessEvalOptions;
};
export {};
//# sourceMappingURL=answer-completeness.d.ts.map

package/dist/answer-completeness.d.ts.map
ADDED
@@ -0,0 +1 @@
{"version":3,"file":"answer-completeness.d.ts","sourceRoot":"","sources":["../src/answer-completeness.ts"],"names":[],"mappings":"AAQA,OAAO,KAAK,EAAE,eAAe,EAAE,MAAM,SAAS,CAAC;AAE/C,eAAO,MAAM,wBAAwB,2mEAmDnC,CAAC;AAEH,KAAK,6BAA6B,GAAG,eAAe,GAAG;IACtD,SAAS,EAAE,MAAM,CAAC;CAClB,CAAC;AAEF,eAAO,MAAM,kBAAkB;;;;eAHnB,MAAM;;;;;;;;;;;;CAgChB,CAAC"}
package/dist/answer-completeness.js
ADDED
@@ -0,0 +1,79 @@
import { openai } from '@ai-sdk/openai';
import { createPresetEval, interpolatePrompt, generateEvalResult, } from './_utils';
export const answerCompletenessPrompt = `You are evaluating whether an LLM response directly addresses the user's request.

## Inputs

- USER REQUEST: {{USER_REQUEST}}
- MODEL RESPONSE: {{MODEL_RESPONSE}}

## Your task

1. Assume a strict auditor who expects every part of the user's request to be addressed.
2. Identify each distinct question, instruction, or requirement in the USER REQUEST.
3. For each identified item, check whether the MODEL RESPONSE provides a direct, relevant answer or fulfillment.
4. Flag any of the following violations:
   - Ignoring part of the request entirely
   - Providing tangential information instead of answering
   - Deflecting with "I can't help with that" without justification
   - Answering a different question than what was asked
   - Providing vague or generic responses that do not address specifics
   - Excessive hedging or caveats that obscure whether the question was answered

## Score

- Start from 1.0.
- Subtract points for each violation:
   - Minor omission (small detail or sub-question not addressed): −0.2
   - Partial answer (core question addressed but incompletely): −0.3
   - Tangential response (related content but does not answer the question): −0.5
   - Complete miss (major requirement or question ignored entirely): −0.6
   - Refusal without valid justification: −0.8
- Minimum score is 0.0.
- Multiple violations compound independently.

## Pass/Fail

- passed = true only if score ≥ 0.7 AND no complete misses or unjustified refusals are present.

## Constraints

- Do not credit the response for being correct if it does not address what was asked.
- Do not credit the response for being helpful on unrelated topics.
- Do not infer that the user's needs were met unless explicitly addressed in the response.
- Do not excuse incomplete answers due to response length or complexity.

## Output format (STRICT JSON, one line reason):

{
  "score": <number between 0.0 and 1.0>,
  "passed": <true|false>,
  "metadata": {
    "reason": "<single concise sentence listing which parts of the request were or were not addressed>"
  }
}`;
export const answerCompleteness = createPresetEval({
    name: 'answer-completeness',
    description: 'Evaluates whether response fully addresses all parts of the user request',
    options: {
        model: openai('gpt-4o'),
        threshold: 0.7,
    },
    handler: async (ctx, input, output, options) => {
        const prompt = interpolatePrompt(answerCompletenessPrompt, {
            USER_REQUEST: input.request,
            MODEL_RESPONSE: output.response,
        });
        const evaluation = await generateEvalResult({ model: options.model, prompt });
        return {
            passed: evaluation.passed && (evaluation.score ?? 1) >= options.threshold,
            score: evaluation.score,
            metadata: {
                ...evaluation.metadata,
                model: options.model,
                threshold: options.threshold,
            },
        };
    },
});
//# sourceMappingURL=answer-completeness.js.map
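Every preset in this package follows the factory pattern shown above: the exported function takes an optional overrides object (per the declaration file: `name`, `description`, the base options such as `model`, the preset-specific `threshold`, and an optional `middleware`) and returns a `CreateEvalConfig` for `@agentuity/runtime`. A minimal usage sketch, assuming the package root re-exports the preset as `dist/index.js` later in this diff shows; the stricter values are illustrative, not defaults:

```ts
import { openai } from '@ai-sdk/openai';
import { answerCompleteness } from '@agentuity/evals';

// Use the preset as published (gpt-4o judge, 0.7 threshold)...
const defaultCompleteness = answerCompleteness();

// ...or override selected options; anything omitted keeps the preset default.
const strictCompleteness = answerCompleteness({
    name: 'answer-completeness-strict', // hypothetical name for the tightened variant
    threshold: 0.9,
    model: openai('gpt-4o-mini'),
});
```

How the returned config is registered against an agent is defined by `@agentuity/runtime` and is not part of this diff.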
package/dist/answer-completeness.js.map
ADDED
@@ -0,0 +1 @@
{"version":3,"file":"answer-completeness.js","sourceRoot":"","sources":["../src/answer-completeness.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,MAAM,EAAE,MAAM,gBAAgB,CAAC;AACxC,OAAO,EACN,gBAAgB,EAChB,iBAAiB,EACjB,kBAAkB,GAGlB,MAAM,UAAU,CAAC;AAGlB,MAAM,CAAC,MAAM,wBAAwB,GAAG;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;EAmDtC,CAAC;AAMH,MAAM,CAAC,MAAM,kBAAkB,GAAG,gBAAgB,CAIhD;IACD,IAAI,EAAE,qBAAqB;IAC3B,WAAW,EAAE,0EAA0E;IACvF,OAAO,EAAE;QACR,KAAK,EAAE,MAAM,CAAC,QAAQ,CAAC;QACvB,SAAS,EAAE,GAAG;KACd;IACD,OAAO,EAAE,KAAK,EAAE,GAAG,EAAE,KAAK,EAAE,MAAM,EAAE,OAAO,EAAE,EAAE;QAC9C,MAAM,MAAM,GAAG,iBAAiB,CAAC,wBAAwB,EAAE;YAC1D,YAAY,EAAE,KAAK,CAAC,OAAO;YAC3B,cAAc,EAAE,MAAM,CAAC,QAAQ;SAC/B,CAAC,CAAC;QAEH,MAAM,UAAU,GAAG,MAAM,kBAAkB,CAAC,EAAE,KAAK,EAAE,OAAO,CAAC,KAAK,EAAE,MAAM,EAAE,CAAC,CAAC;QAE9E,OAAO;YACN,MAAM,EAAE,UAAU,CAAC,MAAM,IAAI,CAAC,UAAU,CAAC,KAAK,IAAI,CAAC,CAAC,IAAI,OAAO,CAAC,SAAS;YACzE,KAAK,EAAE,UAAU,CAAC,KAAK;YACvB,QAAQ,EAAE;gBACT,GAAG,UAAU,CAAC,QAAQ;gBACtB,KAAK,EAAE,OAAO,CAAC,KAAK;gBACpB,SAAS,EAAE,OAAO,CAAC,SAAS;aAC5B;SACD,CAAC;IACH,CAAC;CACD,CAAC,CAAC"}
package/dist/conciseness.d.ts
ADDED
@@ -0,0 +1,24 @@
import type { BaseEvalOptions } from './types';
export declare const concisenessPrompt = "You are evaluating whether an LLM response is unnecessarily verbose for the request type.\n\n## Inputs\n\n- USER REQUEST: {{USER_REQUEST}}\n- MODEL RESPONSE: {{MODEL_RESPONSE}}\n\n## Your task\n\n1. Assume a reader who values efficiency and dislikes wasted words.\n2. Assess the complexity and scope of the USER REQUEST to determine appropriate response length.\n3. Identify any of the following verbosity violations in the MODEL RESPONSE:\n - Repeating the same information in different words\n - Excessive preamble before addressing the request\n - Unnecessary summaries or recaps at the end\n - Filler phrases that add no meaning (e.g., \"It's important to note that...\", \"As you may know...\")\n - Over-explanation of simple concepts the user likely understands\n - Including tangential context not required to answer the question\n - Excessive hedging or qualifiers beyond what accuracy requires\n\n## Score\n\n- Start from 1.0.\n- Subtract points for each violation:\n - Minor filler or unnecessary phrase: \u22120.1\n - Redundant paragraph or repeated explanation: \u22120.3\n - Excessive preamble or postamble (multiple sentences): \u22120.3\n - Response length grossly disproportionate to request simplicity: \u22120.5\n- Minimum score is 0.0.\n- Multiple violations compound independently.\n\n## Pass/Fail\n\n- passed = true only if score \u2265 0.7 AND response length is reasonably proportionate to request complexity.\n\n## Constraints\n\n- Do not penalize thoroughness when the request genuinely requires detailed explanation.\n- Do not credit brevity if it sacrifices completeness.\n- Do not assume the user wants verbose responses unless explicitly requested.\n- Judge verbosity relative to what a competent, efficient response would require.\n\n## Output format (STRICT JSON, one line reason):\n\n{\n \"score\": <number between 0.0 and 1.0>,\n \"passed\": <true|false>,\n \"metadata\": {\n \"reason\": \"<single concise sentence describing verbosity issues found or confirming appropriate length>\"\n }\n}";
type ConcisenessEvalOptions = BaseEvalOptions & {
    threshold: number;
};
export declare const conciseness: <TAgentInput extends import("@agentuity/core").StandardSchemaV1 | undefined = any, TAgentOutput extends import("@agentuity/core").StandardSchemaV1 | undefined = any>(overrides?: (Partial<{
    name?: string;
    description?: string;
} & BaseEvalOptions & {
    threshold: number;
}> & {
    middleware?: import("./types").EvalMiddleware<TAgentInput extends import("@agentuity/core").StandardSchemaV1<unknown, unknown> ? import("@agentuity/core").InferOutput<TAgentInput> : any, TAgentOutput extends import("@agentuity/core").StandardSchemaV1<unknown, unknown> ? import("@agentuity/core").InferOutput<TAgentOutput> : any, {
        request: string;
    } & {
        context?: string | undefined;
    }, {
        response: string;
    } & {}> | undefined;
}) | undefined) => import("@agentuity/runtime").CreateEvalConfig<any, any> & {
    name: string;
    options: ConcisenessEvalOptions;
};
export {};
//# sourceMappingURL=conciseness.d.ts.map

package/dist/conciseness.d.ts.map
ADDED
@@ -0,0 +1 @@
{"version":3,"file":"conciseness.d.ts","sourceRoot":"","sources":["../src/conciseness.ts"],"names":[],"mappings":"AAQA,OAAO,KAAK,EAAE,eAAe,EAAE,MAAM,SAAS,CAAC;AAE/C,eAAO,MAAM,iBAAiB,0gEAkD5B,CAAC;AAEH,KAAK,sBAAsB,GAAG,eAAe,GAAG;IAC/C,SAAS,EAAE,MAAM,CAAC;CAClB,CAAC;AAEF,eAAO,MAAM,WAAW;;;;eAHZ,MAAM;;;;;;;;;;;;CAgChB,CAAC"}
package/dist/conciseness.js
ADDED
@@ -0,0 +1,78 @@
import { openai } from '@ai-sdk/openai';
import { createPresetEval, interpolatePrompt, generateEvalResult, } from './_utils';
export const concisenessPrompt = `You are evaluating whether an LLM response is unnecessarily verbose for the request type.

## Inputs

- USER REQUEST: {{USER_REQUEST}}
- MODEL RESPONSE: {{MODEL_RESPONSE}}

## Your task

1. Assume a reader who values efficiency and dislikes wasted words.
2. Assess the complexity and scope of the USER REQUEST to determine appropriate response length.
3. Identify any of the following verbosity violations in the MODEL RESPONSE:
   - Repeating the same information in different words
   - Excessive preamble before addressing the request
   - Unnecessary summaries or recaps at the end
   - Filler phrases that add no meaning (e.g., "It's important to note that...", "As you may know...")
   - Over-explanation of simple concepts the user likely understands
   - Including tangential context not required to answer the question
   - Excessive hedging or qualifiers beyond what accuracy requires

## Score

- Start from 1.0.
- Subtract points for each violation:
   - Minor filler or unnecessary phrase: −0.1
   - Redundant paragraph or repeated explanation: −0.3
   - Excessive preamble or postamble (multiple sentences): −0.3
   - Response length grossly disproportionate to request simplicity: −0.5
- Minimum score is 0.0.
- Multiple violations compound independently.

## Pass/Fail

- passed = true only if score ≥ 0.7 AND response length is reasonably proportionate to request complexity.

## Constraints

- Do not penalize thoroughness when the request genuinely requires detailed explanation.
- Do not credit brevity if it sacrifices completeness.
- Do not assume the user wants verbose responses unless explicitly requested.
- Judge verbosity relative to what a competent, efficient response would require.

## Output format (STRICT JSON, one line reason):

{
  "score": <number between 0.0 and 1.0>,
  "passed": <true|false>,
  "metadata": {
    "reason": "<single concise sentence describing verbosity issues found or confirming appropriate length>"
  }
}`;
export const conciseness = createPresetEval({
    name: 'conciseness',
    description: 'Evaluates whether response is appropriately concise without unnecessary verbosity',
    options: {
        model: openai('gpt-4o'),
        threshold: 0.7,
    },
    handler: async (ctx, input, output, options) => {
        const prompt = interpolatePrompt(concisenessPrompt, {
            USER_REQUEST: input.request,
            MODEL_RESPONSE: output.response,
        });
        const evaluation = await generateEvalResult({ model: options.model, prompt });
        return {
            passed: evaluation.passed && (evaluation.score ?? 1) >= options.threshold,
            score: evaluation.score,
            metadata: {
                ...evaluation.metadata,
                model: options.model,
                threshold: options.threshold,
            },
        };
    },
});
//# sourceMappingURL=conciseness.js.map

package/dist/conciseness.js.map
ADDED
@@ -0,0 +1 @@
{"version":3,"file":"conciseness.js","sourceRoot":"","sources":["../src/conciseness.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,MAAM,EAAE,MAAM,gBAAgB,CAAC;AACxC,OAAO,EACN,gBAAgB,EAChB,iBAAiB,EACjB,kBAAkB,GAGlB,MAAM,UAAU,CAAC;AAGlB,MAAM,CAAC,MAAM,iBAAiB,GAAG;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;EAkD/B,CAAC;AAMH,MAAM,CAAC,MAAM,WAAW,GAAG,gBAAgB,CAIzC;IACD,IAAI,EAAE,aAAa;IACnB,WAAW,EAAE,mFAAmF;IAChG,OAAO,EAAE;QACR,KAAK,EAAE,MAAM,CAAC,QAAQ,CAAC;QACvB,SAAS,EAAE,GAAG;KACd;IACD,OAAO,EAAE,KAAK,EAAE,GAAG,EAAE,KAAK,EAAE,MAAM,EAAE,OAAO,EAAE,EAAE;QAC9C,MAAM,MAAM,GAAG,iBAAiB,CAAC,iBAAiB,EAAE;YACnD,YAAY,EAAE,KAAK,CAAC,OAAO;YAC3B,cAAc,EAAE,MAAM,CAAC,QAAQ;SAC/B,CAAC,CAAC;QAEH,MAAM,UAAU,GAAG,MAAM,kBAAkB,CAAC,EAAE,KAAK,EAAE,OAAO,CAAC,KAAK,EAAE,MAAM,EAAE,CAAC,CAAC;QAE9E,OAAO;YACN,MAAM,EAAE,UAAU,CAAC,MAAM,IAAI,CAAC,UAAU,CAAC,KAAK,IAAI,CAAC,CAAC,IAAI,OAAO,CAAC,SAAS;YACzE,KAAK,EAAE,UAAU,CAAC,KAAK;YACvB,QAAQ,EAAE;gBACT,GAAG,UAAU,CAAC,QAAQ;gBACtB,KAAK,EAAE,OAAO,CAAC,KAAK;gBACpB,SAAS,EAAE,OAAO,CAAC,SAAS;aAC5B;SACD,CAAC;IACH,CAAC;CACD,CAAC,CAAC"}
package/dist/extraneous-content.d.ts
ADDED
@@ -0,0 +1,24 @@
import type { BaseEvalOptions } from './types';
export declare const extraneousContentPrompt = "You are evaluating whether an LLM response contains content clearly unrelated or unnecessary for fulfilling the request.\n\n## Inputs\n\n- USER REQUEST: {{USER_REQUEST}}\n- MODEL RESPONSE: {{MODEL_RESPONSE}}\n\n## Your task\n\n1. Assume a strict editor who removes anything not directly serving the user's goal.\n2. Identify each distinct section, paragraph, or statement in the MODEL RESPONSE.\n3. For each element, determine whether it contributes to answering the USER REQUEST.\n4. Flag any of the following as extraneous content:\n - Topics or information not requested and not necessary for context\n - Unsolicited advice or recommendations beyond the scope of the request\n - Self-promotional statements about capabilities\n - Unnecessary caveats or warnings unrelated to the specific request\n - Meta-commentary about the response itself (e.g., \"I hope this helps!\")\n - Offers to help with additional unrelated tasks\n - Background information the user clearly already knows based on their request\n - Repeated information already stated elsewhere in the response\n\n## Score\n\n- Start from 1.0.\n- Subtract points for each extraneous element:\n - Brief unnecessary phrase or sentence: \u22120.2\n - Full paragraph of off-topic content: \u22120.4\n - Multiple paragraphs or significant tangent: \u22120.6\n- Minimum score is 0.0.\n- Multiple violations compound independently.\n\n## Pass/Fail\n\n- passed = true only if score \u2265 {{THRESHOLD}} AND no major tangents or significant off-topic content is present.\n\n## Constraints\n\n- Do not credit relevant information for excusing unrelated additions.\n- Do not assume the user wants additional context unless requested.\n- Do not excuse extraneous content because it might be \"useful\" to some readers.\n- Helpful-but-unsolicited content is still extraneous if not requested.\n- Necessary context to understand the answer is not extraneous.\n\n## Output format (STRICT JSON, one line reason):\n\n{\n \"score\": <number between 0.0 and 1.0>,\n \"passed\": <true|false>,\n \"metadata\": {\n \"reason\": \"<single concise sentence listing extraneous content found or confirming all content was relevant>\"\n }\n}";
type ExtraneousContentEvalOptions = BaseEvalOptions & {
    threshold: number;
};
export declare const extraneousContent: <TAgentInput extends import("@agentuity/core").StandardSchemaV1 | undefined = any, TAgentOutput extends import("@agentuity/core").StandardSchemaV1 | undefined = any>(overrides?: (Partial<{
    name?: string;
    description?: string;
} & BaseEvalOptions & {
    threshold: number;
}> & {
    middleware?: import("./types").EvalMiddleware<TAgentInput extends import("@agentuity/core").StandardSchemaV1<unknown, unknown> ? import("@agentuity/core").InferOutput<TAgentInput> : any, TAgentOutput extends import("@agentuity/core").StandardSchemaV1<unknown, unknown> ? import("@agentuity/core").InferOutput<TAgentOutput> : any, {
        request: string;
    } & {
        context?: string | undefined;
    }, {
        response: string;
    } & {}> | undefined;
}) | undefined) => import("@agentuity/runtime").CreateEvalConfig<any, any> & {
    name: string;
    options: ExtraneousContentEvalOptions;
};
export {};
//# sourceMappingURL=extraneous-content.d.ts.map

package/dist/extraneous-content.d.ts.map
ADDED
@@ -0,0 +1 @@
{"version":3,"file":"extraneous-content.d.ts","sourceRoot":"","sources":["../src/extraneous-content.ts"],"names":[],"mappings":"AAQA,OAAO,KAAK,EAAE,eAAe,EAAE,MAAM,SAAS,CAAC;AAE/C,eAAO,MAAM,uBAAuB,kqEAoDlC,CAAC;AAEH,KAAK,4BAA4B,GAAG,eAAe,GAAG;IACrD,SAAS,EAAE,MAAM,CAAC;CAClB,CAAC;AAEF,eAAO,MAAM,iBAAiB;;;;eAHlB,MAAM;;;;;;;;;;;;CAiChB,CAAC"}
package/dist/extraneous-content.js
ADDED
@@ -0,0 +1,81 @@
import { openai } from '@ai-sdk/openai';
import { createPresetEval, interpolatePrompt, generateEvalResult, } from './_utils';
export const extraneousContentPrompt = `You are evaluating whether an LLM response contains content clearly unrelated or unnecessary for fulfilling the request.

## Inputs

- USER REQUEST: {{USER_REQUEST}}
- MODEL RESPONSE: {{MODEL_RESPONSE}}

## Your task

1. Assume a strict editor who removes anything not directly serving the user's goal.
2. Identify each distinct section, paragraph, or statement in the MODEL RESPONSE.
3. For each element, determine whether it contributes to answering the USER REQUEST.
4. Flag any of the following as extraneous content:
   - Topics or information not requested and not necessary for context
   - Unsolicited advice or recommendations beyond the scope of the request
   - Self-promotional statements about capabilities
   - Unnecessary caveats or warnings unrelated to the specific request
   - Meta-commentary about the response itself (e.g., "I hope this helps!")
   - Offers to help with additional unrelated tasks
   - Background information the user clearly already knows based on their request
   - Repeated information already stated elsewhere in the response

## Score

- Start from 1.0.
- Subtract points for each extraneous element:
   - Brief unnecessary phrase or sentence: −0.2
   - Full paragraph of off-topic content: −0.4
   - Multiple paragraphs or significant tangent: −0.6
- Minimum score is 0.0.
- Multiple violations compound independently.

## Pass/Fail

- passed = true only if score ≥ {{THRESHOLD}} AND no major tangents or significant off-topic content is present.

## Constraints

- Do not credit relevant information for excusing unrelated additions.
- Do not assume the user wants additional context unless requested.
- Do not excuse extraneous content because it might be "useful" to some readers.
- Helpful-but-unsolicited content is still extraneous if not requested.
- Necessary context to understand the answer is not extraneous.

## Output format (STRICT JSON, one line reason):

{
  "score": <number between 0.0 and 1.0>,
  "passed": <true|false>,
  "metadata": {
    "reason": "<single concise sentence listing extraneous content found or confirming all content was relevant>"
  }
}`;
export const extraneousContent = createPresetEval({
    name: 'extraneous-content',
    description: 'Evaluates whether response contains unnecessary or off-topic content',
    options: {
        model: openai('gpt-4o'),
        threshold: 0.7,
    },
    handler: async (ctx, input, output, options) => {
        const prompt = interpolatePrompt(extraneousContentPrompt, {
            USER_REQUEST: input.request,
            MODEL_RESPONSE: output.response,
            THRESHOLD: options.threshold.toString(),
        });
        const evaluation = await generateEvalResult({ model: options.model, prompt });
        return {
            passed: evaluation.passed && (evaluation.score ?? 1) >= options.threshold,
            score: evaluation.score,
            metadata: {
                ...evaluation.metadata,
                model: options.model,
                threshold: options.threshold,
            },
        };
    },
});
//# sourceMappingURL=extraneous-content.js.map
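`extraneous-content` is the only preset in this excerpt that also injects its threshold into the judge prompt through `{{THRESHOLD}}`, which is why `options.threshold.toString()` is passed. The shared helpers live in `package/src/_utils.ts` (not shown in this excerpt), but these call sites imply that `interpolatePrompt` substitutes `{{KEY}}` placeholders with string values. A rough sketch of that assumed behavior, not the package's actual implementation:

```ts
// Assumption: mirrors how interpolatePrompt appears to be used by the presets.
function interpolatePromptSketch(template: string, values: Record<string, string>): string {
    return Object.entries(values).reduce(
        (rendered, [key, value]) => rendered.replaceAll(`{{${key}}}`, value),
        template,
    );
}

// e.g. turns "score ≥ {{THRESHOLD}}" into "score ≥ 0.7"
const rule = interpolatePromptSketch('score ≥ {{THRESHOLD}}', { THRESHOLD: (0.7).toString() });
```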
package/dist/extraneous-content.js.map
ADDED
@@ -0,0 +1 @@
{"version":3,"file":"extraneous-content.js","sourceRoot":"","sources":["../src/extraneous-content.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,MAAM,EAAE,MAAM,gBAAgB,CAAC;AACxC,OAAO,EACN,gBAAgB,EAChB,iBAAiB,EACjB,kBAAkB,GAGlB,MAAM,UAAU,CAAC;AAGlB,MAAM,CAAC,MAAM,uBAAuB,GAAG;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;EAoDrC,CAAC;AAMH,MAAM,CAAC,MAAM,iBAAiB,GAAG,gBAAgB,CAI/C;IACD,IAAI,EAAE,oBAAoB;IAC1B,WAAW,EAAE,sEAAsE;IACnF,OAAO,EAAE;QACR,KAAK,EAAE,MAAM,CAAC,QAAQ,CAAC;QACvB,SAAS,EAAE,GAAG;KACd;IACD,OAAO,EAAE,KAAK,EAAE,GAAG,EAAE,KAAK,EAAE,MAAM,EAAE,OAAO,EAAE,EAAE;QAC9C,MAAM,MAAM,GAAG,iBAAiB,CAAC,uBAAuB,EAAE;YACzD,YAAY,EAAE,KAAK,CAAC,OAAO;YAC3B,cAAc,EAAE,MAAM,CAAC,QAAQ;YAC/B,SAAS,EAAE,OAAO,CAAC,SAAS,CAAC,QAAQ,EAAE;SACvC,CAAC,CAAC;QAEH,MAAM,UAAU,GAAG,MAAM,kBAAkB,CAAC,EAAE,KAAK,EAAE,OAAO,CAAC,KAAK,EAAE,MAAM,EAAE,CAAC,CAAC;QAE9E,OAAO;YACN,MAAM,EAAE,UAAU,CAAC,MAAM,IAAI,CAAC,UAAU,CAAC,KAAK,IAAI,CAAC,CAAC,IAAI,OAAO,CAAC,SAAS;YACzE,KAAK,EAAE,UAAU,CAAC,KAAK;YACvB,QAAQ,EAAE;gBACT,GAAG,UAAU,CAAC,QAAQ;gBACtB,KAAK,EAAE,OAAO,CAAC,KAAK;gBACpB,SAAS,EAAE,OAAO,CAAC,SAAS;aAC5B;SACD,CAAC;IACH,CAAC;CACD,CAAC,CAAC"}
package/dist/format.d.ts
ADDED
@@ -0,0 +1,18 @@
import type { BaseEvalOptions } from './types';
export declare const formatPrompt = "You are evaluating whether an LLM response matches the format requested by the user.\n\n## Inputs\n\n- USER REQUEST: {{USER_REQUEST}}\n- MODEL RESPONSE: {{MODEL_RESPONSE}}\n\n## Your task\n\n1. Assume a strict validator checking format compliance.\n2. Identify any explicit format requirements in the USER REQUEST, including:\n - Structured data formats: JSON, XML, YAML, CSV, etc.\n - Document formats: Markdown, plain text, HTML, etc.\n - List formats: Bullet points, numbered lists, comma-separated, etc.\n - Table formats: Markdown tables, ASCII tables, etc.\n - Code formats: Specific programming language, code blocks, etc.\n - Length constraints: Word counts, character limits, number of items, etc.\n - Structural requirements: Sections, headers, specific fields, etc.\n3. If no format is explicitly requested, this eval automatically passes.\n4. If a format is requested, verify the MODEL RESPONSE strictly adheres to it:\n - JSON must be valid, parseable JSON\n - Lists must use the specified list style\n - Tables must have proper structure\n - Code must be in the specified language and properly formatted\n - Length constraints must be met exactly or within stated tolerance\n\n## Pass/Fail\n\n- passed = true only if no format was requested OR the response strictly matches all requested format requirements.\n- passed = false if any format requirement is violated, even partially.\n\n## Constraints\n\n- Do not assume implicit format preferences; only enforce explicit requests.\n- Do not credit \"close enough\" formatting; requirements must be met exactly.\n- Do not excuse format violations because the content is otherwise correct.\n- Do not pass responses that wrap requested format in additional commentary unless explicitly allowed.\n- JSON responses with syntax errors (trailing commas, unquoted keys, etc.) are failures.\n\n## Output format (STRICT JSON, one line reason):\n\n{\n \"passed\": <true|false>,\n \"metadata\": {\n \"reason\": \"<single concise sentence stating format requirement and whether it was met, or confirming no format was requested>\"\n }\n}";
export declare const format: <TAgentInput extends import("@agentuity/core").StandardSchemaV1 | undefined = any, TAgentOutput extends import("@agentuity/core").StandardSchemaV1 | undefined = any>(overrides?: (Partial<{
    name?: string;
    description?: string;
} & BaseEvalOptions> & {
    middleware?: import("./types").EvalMiddleware<TAgentInput extends import("@agentuity/core").StandardSchemaV1<unknown, unknown> ? import("@agentuity/core").InferOutput<TAgentInput> : any, TAgentOutput extends import("@agentuity/core").StandardSchemaV1<unknown, unknown> ? import("@agentuity/core").InferOutput<TAgentOutput> : any, {
        request: string;
    } & {
        context?: string | undefined;
    }, {
        response: string;
    } & {}> | undefined;
}) | undefined) => import("@agentuity/runtime").CreateEvalConfig<any, any> & {
    name: string;
    options: BaseEvalOptions;
};
//# sourceMappingURL=format.d.ts.map

package/dist/format.d.ts.map
ADDED
@@ -0,0 +1 @@
{"version":3,"file":"format.d.ts","sourceRoot":"","sources":["../src/format.ts"],"names":[],"mappings":"AAQA,OAAO,KAAK,EAAE,eAAe,EAAE,MAAM,SAAS,CAAC;AAE/C,eAAO,MAAM,YAAY,0kEA8CvB,CAAC;AAEH,eAAO,MAAM,MAAM;;;;;;;;;;;;;;CAsBjB,CAAC"}
package/dist/format.js
ADDED
@@ -0,0 +1,71 @@
import { openai } from '@ai-sdk/openai';
import { createPresetEval, interpolatePrompt, generateEvalResult, } from './_utils';
export const formatPrompt = `You are evaluating whether an LLM response matches the format requested by the user.

## Inputs

- USER REQUEST: {{USER_REQUEST}}
- MODEL RESPONSE: {{MODEL_RESPONSE}}

## Your task

1. Assume a strict validator checking format compliance.
2. Identify any explicit format requirements in the USER REQUEST, including:
   - Structured data formats: JSON, XML, YAML, CSV, etc.
   - Document formats: Markdown, plain text, HTML, etc.
   - List formats: Bullet points, numbered lists, comma-separated, etc.
   - Table formats: Markdown tables, ASCII tables, etc.
   - Code formats: Specific programming language, code blocks, etc.
   - Length constraints: Word counts, character limits, number of items, etc.
   - Structural requirements: Sections, headers, specific fields, etc.
3. If no format is explicitly requested, this eval automatically passes.
4. If a format is requested, verify the MODEL RESPONSE strictly adheres to it:
   - JSON must be valid, parseable JSON
   - Lists must use the specified list style
   - Tables must have proper structure
   - Code must be in the specified language and properly formatted
   - Length constraints must be met exactly or within stated tolerance

## Pass/Fail

- passed = true only if no format was requested OR the response strictly matches all requested format requirements.
- passed = false if any format requirement is violated, even partially.

## Constraints

- Do not assume implicit format preferences; only enforce explicit requests.
- Do not credit "close enough" formatting; requirements must be met exactly.
- Do not excuse format violations because the content is otherwise correct.
- Do not pass responses that wrap requested format in additional commentary unless explicitly allowed.
- JSON responses with syntax errors (trailing commas, unquoted keys, etc.) are failures.

## Output format (STRICT JSON, one line reason):

{
  "passed": <true|false>,
  "metadata": {
    "reason": "<single concise sentence stating format requirement and whether it was met, or confirming no format was requested>"
  }
}`;
export const format = createPresetEval({
    name: 'format',
    description: 'Evaluates whether response matches the requested format',
    options: {
        model: openai('gpt-4o'),
    },
    handler: async (ctx, input, output, options) => {
        const prompt = interpolatePrompt(formatPrompt, {
            USER_REQUEST: input.request,
            MODEL_RESPONSE: output.response,
        });
        const evaluation = await generateEvalResult({ model: options.model, prompt });
        return {
            passed: evaluation.passed,
            metadata: {
                ...evaluation.metadata,
                model: options.model,
            },
        };
    },
});
//# sourceMappingURL=format.js.map
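Unlike the scored presets, `format` has no `threshold` option and simply forwards the judge's boolean verdict. Across all of these handlers the object returned by `generateEvalResult` is only ever read as `passed`, an optional `score`, and a `metadata.reason` string, mirroring the STRICT JSON block each prompt demands. A sketch of that result shape as the call sites imply it; the authoritative types are in `dist/_utils.d.ts`, which is listed above but not shown in this excerpt:

```ts
// Inferred from handler usage (evaluation.passed, evaluation.score, evaluation.metadata).
type JudgeVerdict = {
    passed: boolean;
    score?: number; // omitted by pass/fail-only prompts such as format
    metadata?: {
        reason: string; // the single-sentence justification each prompt requires
    };
};

// Example verdict a format check might yield:
const verdict: JudgeVerdict = {
    passed: false,
    metadata: { reason: 'The user asked for valid JSON but the response wrapped it in prose.' },
};
```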
package/dist/format.js.map
ADDED
@@ -0,0 +1 @@
{"version":3,"file":"format.js","sourceRoot":"","sources":["../src/format.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,MAAM,EAAE,MAAM,gBAAgB,CAAC;AACxC,OAAO,EACN,gBAAgB,EAChB,iBAAiB,EACjB,kBAAkB,GAGlB,MAAM,UAAU,CAAC;AAGlB,MAAM,CAAC,MAAM,YAAY,GAAG;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;EA8C1B,CAAC;AAEH,MAAM,CAAC,MAAM,MAAM,GAAG,gBAAgB,CAAuD;IAC5F,IAAI,EAAE,QAAQ;IACd,WAAW,EAAE,yDAAyD;IACtE,OAAO,EAAE;QACR,KAAK,EAAE,MAAM,CAAC,QAAQ,CAAC;KACvB;IACD,OAAO,EAAE,KAAK,EAAE,GAAG,EAAE,KAAK,EAAE,MAAM,EAAE,OAAO,EAAE,EAAE;QAC9C,MAAM,MAAM,GAAG,iBAAiB,CAAC,YAAY,EAAE;YAC9C,YAAY,EAAE,KAAK,CAAC,OAAO;YAC3B,cAAc,EAAE,MAAM,CAAC,QAAQ;SAC/B,CAAC,CAAC;QAEH,MAAM,UAAU,GAAG,MAAM,kBAAkB,CAAC,EAAE,KAAK,EAAE,OAAO,CAAC,KAAK,EAAE,MAAM,EAAE,CAAC,CAAC;QAE9E,OAAO;YACN,MAAM,EAAE,UAAU,CAAC,MAAM;YACzB,QAAQ,EAAE;gBACT,GAAG,UAAU,CAAC,QAAQ;gBACtB,KAAK,EAAE,OAAO,CAAC,KAAK;aACpB;SACD,CAAC;IACH,CAAC;CACD,CAAC,CAAC"}
package/dist/index.d.ts
ADDED
@@ -0,0 +1,15 @@
export { createPresetEval, interpolatePrompt, generateEvalResult, type DefaultEvalInput, type DefaultEvalOutput, type GenerateEvalResultOptions, } from './_utils';
export type { BaseEvalOptions, EvalMiddleware } from './types';
export { politeness, politenessPrompt } from './politeness';
export { safety, safetyPrompt } from './safety';
export { pii, piiPrompt } from './pii';
export { conciseness, concisenessPrompt } from './conciseness';
export { adversarial, adversarialPrompt } from './adversarial';
export { ambiguity, ambiguityPrompt } from './ambiguity';
export { answerCompleteness, answerCompletenessPrompt } from './answer-completeness';
export { extraneousContent, extraneousContentPrompt } from './extraneous-content';
export { format, formatPrompt } from './format';
export { knowledgeRetention, knowledgeRetentionPrompt } from './knowledge-retention';
export { roleAdherence, roleAdherencePrompt } from './role-adherence';
export { selfReference, selfReferencePrompt } from './self-reference';
//# sourceMappingURL=index.d.ts.map

package/dist/index.d.ts.map
ADDED
@@ -0,0 +1 @@
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA,OAAO,EACN,gBAAgB,EAChB,iBAAiB,EACjB,kBAAkB,EAClB,KAAK,gBAAgB,EACrB,KAAK,iBAAiB,EACtB,KAAK,yBAAyB,GAC9B,MAAM,UAAU,CAAC;AAClB,YAAY,EAAE,eAAe,EAAE,cAAc,EAAE,MAAM,SAAS,CAAC;AAG/D,OAAO,EAAE,UAAU,EAAE,gBAAgB,EAAE,MAAM,cAAc,CAAC;AAC5D,OAAO,EAAE,MAAM,EAAE,YAAY,EAAE,MAAM,UAAU,CAAC;AAChD,OAAO,EAAE,GAAG,EAAE,SAAS,EAAE,MAAM,OAAO,CAAC;AACvC,OAAO,EAAE,WAAW,EAAE,iBAAiB,EAAE,MAAM,eAAe,CAAC;AAC/D,OAAO,EAAE,WAAW,EAAE,iBAAiB,EAAE,MAAM,eAAe,CAAC;AAC/D,OAAO,EAAE,SAAS,EAAE,eAAe,EAAE,MAAM,aAAa,CAAC;AACzD,OAAO,EAAE,kBAAkB,EAAE,wBAAwB,EAAE,MAAM,uBAAuB,CAAC;AACrF,OAAO,EAAE,iBAAiB,EAAE,uBAAuB,EAAE,MAAM,sBAAsB,CAAC;AAClF,OAAO,EAAE,MAAM,EAAE,YAAY,EAAE,MAAM,UAAU,CAAC;AAChD,OAAO,EAAE,kBAAkB,EAAE,wBAAwB,EAAE,MAAM,uBAAuB,CAAC;AACrF,OAAO,EAAE,aAAa,EAAE,mBAAmB,EAAE,MAAM,kBAAkB,CAAC;AACtE,OAAO,EAAE,aAAa,EAAE,mBAAmB,EAAE,MAAM,kBAAkB,CAAC"}
package/dist/index.js
ADDED
@@ -0,0 +1,15 @@
export { createPresetEval, interpolatePrompt, generateEvalResult, } from './_utils';
// Evals (each file contains both the prompt and the eval)
export { politeness, politenessPrompt } from './politeness';
export { safety, safetyPrompt } from './safety';
export { pii, piiPrompt } from './pii';
export { conciseness, concisenessPrompt } from './conciseness';
export { adversarial, adversarialPrompt } from './adversarial';
export { ambiguity, ambiguityPrompt } from './ambiguity';
export { answerCompleteness, answerCompletenessPrompt } from './answer-completeness';
export { extraneousContent, extraneousContentPrompt } from './extraneous-content';
export { format, formatPrompt } from './format';
export { knowledgeRetention, knowledgeRetentionPrompt } from './knowledge-retention';
export { roleAdherence, roleAdherencePrompt } from './role-adherence';
export { selfReference, selfReferencePrompt } from './self-reference';
//# sourceMappingURL=index.js.map
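The entry point re-exports the shared helpers plus the twelve presets, so consumers pull evals from the package root. A small consumption sketch; each factory call returns a runtime `CreateEvalConfig`, and how those configs are wired into an agent is left to `@agentuity/runtime`:

```ts
import { conciseness, format, knowledgeRetention } from '@agentuity/evals';

// Keep the defaults or tighten individual presets per agent.
const evalConfigs = [
    conciseness(),
    format(),
    knowledgeRetention({ threshold: 0.8 }),
];
```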
package/dist/index.js.map
ADDED
@@ -0,0 +1 @@
{"version":3,"file":"index.js","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA,OAAO,EACN,gBAAgB,EAChB,iBAAiB,EACjB,kBAAkB,GAIlB,MAAM,UAAU,CAAC;AAGlB,0DAA0D;AAC1D,OAAO,EAAE,UAAU,EAAE,gBAAgB,EAAE,MAAM,cAAc,CAAC;AAC5D,OAAO,EAAE,MAAM,EAAE,YAAY,EAAE,MAAM,UAAU,CAAC;AAChD,OAAO,EAAE,GAAG,EAAE,SAAS,EAAE,MAAM,OAAO,CAAC;AACvC,OAAO,EAAE,WAAW,EAAE,iBAAiB,EAAE,MAAM,eAAe,CAAC;AAC/D,OAAO,EAAE,WAAW,EAAE,iBAAiB,EAAE,MAAM,eAAe,CAAC;AAC/D,OAAO,EAAE,SAAS,EAAE,eAAe,EAAE,MAAM,aAAa,CAAC;AACzD,OAAO,EAAE,kBAAkB,EAAE,wBAAwB,EAAE,MAAM,uBAAuB,CAAC;AACrF,OAAO,EAAE,iBAAiB,EAAE,uBAAuB,EAAE,MAAM,sBAAsB,CAAC;AAClF,OAAO,EAAE,MAAM,EAAE,YAAY,EAAE,MAAM,UAAU,CAAC;AAChD,OAAO,EAAE,kBAAkB,EAAE,wBAAwB,EAAE,MAAM,uBAAuB,CAAC;AACrF,OAAO,EAAE,aAAa,EAAE,mBAAmB,EAAE,MAAM,kBAAkB,CAAC;AACtE,OAAO,EAAE,aAAa,EAAE,mBAAmB,EAAE,MAAM,kBAAkB,CAAC"}
package/dist/knowledge-retention.d.ts
ADDED
@@ -0,0 +1,24 @@
import type { BaseEvalOptions } from './types';
export declare const knowledgeRetentionPrompt = "You are evaluating whether an LLM response correctly retains and applies facts or decisions provided earlier in the conversation.\n\n## Inputs\n\n- USER REQUEST (may include conversation history or prior context): {{USER_REQUEST}}\n- MODEL RESPONSE: {{MODEL_RESPONSE}}\n\n## Your task\n\n1. Assume an auditor checking for context consistency and memory accuracy.\n2. Identify all facts, decisions, preferences, or constraints established in the USER REQUEST or prior context, including:\n - Names, dates, numbers, or specific values mentioned\n - User preferences or requirements stated earlier\n - Decisions or conclusions reached in prior exchanges\n - Constraints or boundaries defined for the task\n - Corrections or clarifications the user provided\n3. Check the MODEL RESPONSE for any of the following retention failures:\n - Contradicting previously established facts\n - Ignoring stated preferences or requirements\n - Using incorrect values for previously defined variables\n - Forgetting constraints that should limit the response\n - Asking for information already provided\n - Reverting to defaults when specific choices were made\n\n## Score\n\n- Start from 1.0.\n- Subtract points for each retention failure:\n - Minor detail forgotten (peripheral to main task): \u22120.2\n - Preference or requirement ignored: \u22120.4\n - Key fact contradicted or misremembered: \u22120.5\n - Critical constraint violated: \u22120.6\n- Minimum score is 0.0.\n- Multiple failures compound independently.\n\n## Pass/Fail\n\n- passed = true only if score \u2265 0.7 AND no critical facts are contradicted or key constraints violated.\n\n## Constraints\n\n- Do not assume the response \"probably meant\" the correct information.\n- Do not excuse retention failures because the response is otherwise helpful.\n- Do not credit partial retention if critical elements are missed.\n- If no prior context is provided, this eval automatically passes with score 1.0.\n- Evaluate only retention of information explicitly stated, not implied.\n\n## Output format (STRICT JSON, one line reason):\n\n{\n \"score\": <number between 0.0 and 1.0>,\n \"passed\": <true|false>,\n \"metadata\": {\n \"reason\": \"<single concise sentence listing retention failures found or confirming context was correctly maintained>\"\n }\n}";
type KnowledgeRetentionEvalOptions = BaseEvalOptions & {
    threshold: number;
};
export declare const knowledgeRetention: <TAgentInput extends import("@agentuity/core").StandardSchemaV1 | undefined = any, TAgentOutput extends import("@agentuity/core").StandardSchemaV1 | undefined = any>(overrides?: (Partial<{
    name?: string;
    description?: string;
} & BaseEvalOptions & {
    threshold: number;
}> & {
    middleware?: import("./types").EvalMiddleware<TAgentInput extends import("@agentuity/core").StandardSchemaV1<unknown, unknown> ? import("@agentuity/core").InferOutput<TAgentInput> : any, TAgentOutput extends import("@agentuity/core").StandardSchemaV1<unknown, unknown> ? import("@agentuity/core").InferOutput<TAgentOutput> : any, {
        request: string;
    } & {
        context?: string | undefined;
    }, {
        response: string;
    } & {}> | undefined;
}) | undefined) => import("@agentuity/runtime").CreateEvalConfig<any, any> & {
    name: string;
    options: KnowledgeRetentionEvalOptions;
};
export {};
//# sourceMappingURL=knowledge-retention.d.ts.map

package/dist/knowledge-retention.d.ts.map
ADDED
@@ -0,0 +1 @@
{"version":3,"file":"knowledge-retention.d.ts","sourceRoot":"","sources":["../src/knowledge-retention.ts"],"names":[],"mappings":"AAQA,OAAO,KAAK,EAAE,eAAe,EAAE,MAAM,SAAS,CAAC;AAE/C,eAAO,MAAM,wBAAwB,4yEAuDnC,CAAC;AAEH,KAAK,6BAA6B,GAAG,eAAe,GAAG;IACtD,SAAS,EAAE,MAAM,CAAC;CAClB,CAAC;AAEF,eAAO,MAAM,kBAAkB;;;;eAHnB,MAAM;;;;;;;;;;;;CAgChB,CAAC"}
package/dist/knowledge-retention.js
ADDED
@@ -0,0 +1,83 @@
import { openai } from '@ai-sdk/openai';
import { createPresetEval, interpolatePrompt, generateEvalResult, } from './_utils';
export const knowledgeRetentionPrompt = `You are evaluating whether an LLM response correctly retains and applies facts or decisions provided earlier in the conversation.

## Inputs

- USER REQUEST (may include conversation history or prior context): {{USER_REQUEST}}
- MODEL RESPONSE: {{MODEL_RESPONSE}}

## Your task

1. Assume an auditor checking for context consistency and memory accuracy.
2. Identify all facts, decisions, preferences, or constraints established in the USER REQUEST or prior context, including:
   - Names, dates, numbers, or specific values mentioned
   - User preferences or requirements stated earlier
   - Decisions or conclusions reached in prior exchanges
   - Constraints or boundaries defined for the task
   - Corrections or clarifications the user provided
3. Check the MODEL RESPONSE for any of the following retention failures:
   - Contradicting previously established facts
   - Ignoring stated preferences or requirements
   - Using incorrect values for previously defined variables
   - Forgetting constraints that should limit the response
   - Asking for information already provided
   - Reverting to defaults when specific choices were made

## Score

- Start from 1.0.
- Subtract points for each retention failure:
   - Minor detail forgotten (peripheral to main task): −0.2
   - Preference or requirement ignored: −0.4
   - Key fact contradicted or misremembered: −0.5
   - Critical constraint violated: −0.6
- Minimum score is 0.0.
- Multiple failures compound independently.

## Pass/Fail

- passed = true only if score ≥ 0.7 AND no critical facts are contradicted or key constraints violated.

## Constraints

- Do not assume the response "probably meant" the correct information.
- Do not excuse retention failures because the response is otherwise helpful.
- Do not credit partial retention if critical elements are missed.
- If no prior context is provided, this eval automatically passes with score 1.0.
- Evaluate only retention of information explicitly stated, not implied.

## Output format (STRICT JSON, one line reason):

{
  "score": <number between 0.0 and 1.0>,
  "passed": <true|false>,
  "metadata": {
    "reason": "<single concise sentence listing retention failures found or confirming context was correctly maintained>"
  }
}`;
export const knowledgeRetention = createPresetEval({
    name: 'knowledge-retention',
    description: 'Evaluates whether response correctly retains context from earlier in conversation',
    options: {
        model: openai('gpt-4o'),
        threshold: 0.7,
    },
    handler: async (ctx, input, output, options) => {
        const prompt = interpolatePrompt(knowledgeRetentionPrompt, {
            USER_REQUEST: input.request,
            MODEL_RESPONSE: output.response,
        });
        const evaluation = await generateEvalResult({ model: options.model, prompt });
        return {
            passed: evaluation.passed && (evaluation.score ?? 1) >= options.threshold,
            score: evaluation.score,
            metadata: {
                ...evaluation.metadata,
                model: options.model,
                threshold: options.threshold,
            },
        };
    },
});
//# sourceMappingURL=knowledge-retention.js.map
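The scoring arithmetic is the same across the scored presets: deductions compound independently from a starting score of 1.0 with a floor of 0.0, and the handler then requires both the judge's own `passed` flag and `score >= threshold`. A worked example using this preset's rubric (illustrative deductions):

```ts
// One ignored preference (−0.4) plus one forgotten minor detail (−0.2):
const score = Math.max(0, 1.0 - 0.4 - 0.2); // 0.4
const meetsThreshold = score >= 0.7;        // false → the eval fails even if the judge voted pass
```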
package/dist/knowledge-retention.js.map
ADDED
@@ -0,0 +1 @@
{"version":3,"file":"knowledge-retention.js","sourceRoot":"","sources":["../src/knowledge-retention.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,MAAM,EAAE,MAAM,gBAAgB,CAAC;AACxC,OAAO,EACN,gBAAgB,EAChB,iBAAiB,EACjB,kBAAkB,GAGlB,MAAM,UAAU,CAAC;AAGlB,MAAM,CAAC,MAAM,wBAAwB,GAAG;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;EAuDtC,CAAC;AAMH,MAAM,CAAC,MAAM,kBAAkB,GAAG,gBAAgB,CAIhD;IACD,IAAI,EAAE,qBAAqB;IAC3B,WAAW,EAAE,mFAAmF;IAChG,OAAO,EAAE;QACR,KAAK,EAAE,MAAM,CAAC,QAAQ,CAAC;QACvB,SAAS,EAAE,GAAG;KACd;IACD,OAAO,EAAE,KAAK,EAAE,GAAG,EAAE,KAAK,EAAE,MAAM,EAAE,OAAO,EAAE,EAAE;QAC9C,MAAM,MAAM,GAAG,iBAAiB,CAAC,wBAAwB,EAAE;YAC1D,YAAY,EAAE,KAAK,CAAC,OAAO;YAC3B,cAAc,EAAE,MAAM,CAAC,QAAQ;SAC/B,CAAC,CAAC;QAEH,MAAM,UAAU,GAAG,MAAM,kBAAkB,CAAC,EAAE,KAAK,EAAE,OAAO,CAAC,KAAK,EAAE,MAAM,EAAE,CAAC,CAAC;QAE9E,OAAO;YACN,MAAM,EAAE,UAAU,CAAC,MAAM,IAAI,CAAC,UAAU,CAAC,KAAK,IAAI,CAAC,CAAC,IAAI,OAAO,CAAC,SAAS;YACzE,KAAK,EAAE,UAAU,CAAC,KAAK;YACvB,QAAQ,EAAE;gBACT,GAAG,UAAU,CAAC,QAAQ;gBACtB,KAAK,EAAE,OAAO,CAAC,KAAK;gBACpB,SAAS,EAAE,OAAO,CAAC,SAAS;aAC5B;SACD,CAAC;IACH,CAAC;CACD,CAAC,CAAC"}