deepeval 0.1.17
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +162 -0
- package/dist/annotation/api.d.ts +15 -0
- package/dist/annotation/api.js +8 -0
- package/dist/annotation/index.d.ts +3 -0
- package/dist/annotation/index.js +36 -0
- package/dist/annotation/utils.d.ts +2 -0
- package/dist/annotation/utils.js +34 -0
- package/dist/confident/api.d.ts +40 -0
- package/dist/confident/api.js +206 -0
- package/dist/confident/evaluate.d.ts +11 -0
- package/dist/confident/evaluate.js +160 -0
- package/dist/confident/index.d.ts +6 -0
- package/dist/confident/index.js +24 -0
- package/dist/confident/types.d.ts +13 -0
- package/dist/confident/types.js +2 -0
- package/dist/config/settings.d.ts +11 -0
- package/dist/config/settings.js +30 -0
- package/dist/constants.d.ts +4 -0
- package/dist/constants.js +7 -0
- package/dist/dataset/api.d.ts +15 -0
- package/dist/dataset/api.js +2 -0
- package/dist/dataset/dataset.d.ts +54 -0
- package/dist/dataset/dataset.js +289 -0
- package/dist/dataset/golden.d.ts +61 -0
- package/dist/dataset/golden.js +65 -0
- package/dist/dataset/index.d.ts +7 -0
- package/dist/dataset/index.js +23 -0
- package/dist/dataset/utils.d.ts +9 -0
- package/dist/dataset/utils.js +116 -0
- package/dist/index.d.ts +13 -0
- package/dist/index.js +68 -0
- package/dist/integrations/ai-sdk/index.d.ts +29 -0
- package/dist/integrations/ai-sdk/index.js +121 -0
- package/dist/integrations/ai-sdk/processor.d.ts +17 -0
- package/dist/integrations/ai-sdk/processor.js +260 -0
- package/dist/integrations/index.d.ts +2 -0
- package/dist/integrations/index.js +7 -0
- package/dist/integrations/langchain/callback-handler.d.ts +36 -0
- package/dist/integrations/langchain/callback-handler.js +236 -0
- package/dist/integrations/langchain/index.d.ts +1 -0
- package/dist/integrations/langchain/index.js +5 -0
- package/dist/integrations/langchain/patch-tool.d.ts +1 -0
- package/dist/integrations/langchain/patch-tool.js +56 -0
- package/dist/integrations/langchain/utils.d.ts +49 -0
- package/dist/integrations/langchain/utils.js +266 -0
- package/dist/metrics/base-metrics.d.ts +30 -0
- package/dist/metrics/base-metrics.js +36 -0
- package/dist/models/base-model.d.ts +34 -0
- package/dist/models/base-model.js +27 -0
- package/dist/models/index.d.ts +1 -0
- package/dist/models/index.js +5 -0
- package/dist/openai/extractor.d.ts +9 -0
- package/dist/openai/extractor.js +140 -0
- package/dist/openai/index.d.ts +2 -0
- package/dist/openai/index.js +12 -0
- package/dist/openai/patch.d.ts +3 -0
- package/dist/openai/patch.js +147 -0
- package/dist/openai/types.d.ts +15 -0
- package/dist/openai/types.js +2 -0
- package/dist/openai/utils.d.ts +7 -0
- package/dist/openai/utils.js +174 -0
- package/dist/prompt/index.d.ts +61 -0
- package/dist/prompt/index.js +301 -0
- package/dist/prompt/types.d.ts +51 -0
- package/dist/prompt/types.js +157 -0
- package/dist/prompt/utils.d.ts +20 -0
- package/dist/prompt/utils.js +175 -0
- package/dist/simulate/index.d.ts +29 -0
- package/dist/simulate/index.js +176 -0
- package/dist/telemetry.d.ts +13 -0
- package/dist/telemetry.js +322 -0
- package/dist/test-case/index.d.ts +1 -0
- package/dist/test-case/index.js +12 -0
- package/dist/test-case/llm-test-case.d.ts +120 -0
- package/dist/test-case/llm-test-case.js +181 -0
- package/dist/test-case/utils.d.ts +13 -0
- package/dist/test-case/utils.js +33 -0
- package/dist/tracing/api.d.ts +91 -0
- package/dist/tracing/api.js +16 -0
- package/dist/tracing/index.d.ts +4 -0
- package/dist/tracing/index.js +19 -0
- package/dist/tracing/logging.d.ts +12 -0
- package/dist/tracing/logging.js +44 -0
- package/dist/tracing/offline-evals/api.d.ts +7 -0
- package/dist/tracing/offline-evals/api.js +17 -0
- package/dist/tracing/offline-evals/index.d.ts +3 -0
- package/dist/tracing/offline-evals/index.js +9 -0
- package/dist/tracing/offline-evals/span.d.ts +4 -0
- package/dist/tracing/offline-evals/span.js +18 -0
- package/dist/tracing/offline-evals/thread.d.ts +4 -0
- package/dist/tracing/offline-evals/thread.js +19 -0
- package/dist/tracing/offline-evals/trace.d.ts +4 -0
- package/dist/tracing/offline-evals/trace.js +18 -0
- package/dist/tracing/trace-context.d.ts +26 -0
- package/dist/tracing/trace-context.js +59 -0
- package/dist/tracing/tracing.d.ts +328 -0
- package/dist/tracing/tracing.js +1085 -0
- package/dist/tracing/utils.d.ts +11 -0
- package/dist/tracing/utils.js +45 -0
- package/dist/utils.d.ts +22 -0
- package/dist/utils.js +84 -0
- package/package.json +135 -0
|
@@ -0,0 +1,160 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.evaluate = evaluate;
|
|
4
|
+
const api_1 = require("./api");
|
|
5
|
+
const utils_1 = require("../utils");
|
|
6
|
+
const prompt_1 = require("../prompt");
|
|
7
|
+
/**
 * Serializes a single conversational turn into the plain payload shape sent
 * to the Confident AI API, whitelisting only the recognized tool fields.
 *
 * @param {object} turn - Turn with role/content plus optional metadata.
 * @returns {object} Plain object safe to JSON-serialize for the API.
 */
function convertTurn(turn) {
    let serializedTools = undefined;
    if (turn.toolsCalled) {
        serializedTools = [];
        // Copy only the known tool fields; anything else on the tool object
        // is intentionally dropped from the payload.
        for (const { name, description, reasoning, output, inputParameters } of turn.toolsCalled) {
            serializedTools.push({ name, description, reasoning, output, inputParameters });
        }
    }
    return {
        role: turn.role,
        content: turn.content,
        userId: turn.userId,
        retrievalContext: turn.retrievalContext,
        toolsCalled: serializedTools,
        additionalMetadata: turn.additionalMetadata,
    };
}
|
|
26
|
+
/**
 * Serializes an LLMTestCase into the plain payload shape sent to the
 * Confident AI API. Tool lists (called and expected) are projected down to
 * the recognized tool fields; absent lists stay undefined.
 *
 * @param {object} testCase - Single-turn test case.
 * @returns {object} Plain object safe to JSON-serialize for the API.
 */
function convertLLMTestCase(testCase) {
    // Shared projection for both tool lists: keep only whitelisted fields.
    const serializeTools = (tools) => tools
        ? tools.map(({ name, description, reasoning, output, inputParameters }) => ({
            name,
            description,
            reasoning,
            output,
            inputParameters,
        }))
        : undefined;
    return {
        input: testCase.input,
        actualOutput: testCase.actualOutput,
        expectedOutput: testCase.expectedOutput,
        context: testCase.context,
        retrievalContext: testCase.retrievalContext,
        additionalMetadata: testCase.additionalMetadata,
        comments: testCase.comments,
        toolsCalled: serializeTools(testCase.toolsCalled),
        expectedTools: serializeTools(testCase.expectedTools),
        reasoning: testCase.reasoning,
        tokenCost: testCase.tokenCost,
        completionTime: testCase.completionTime,
        name: testCase.name,
    };
}
|
|
61
|
+
/**
 * Serializes a ConversationalTestCase into the API payload shape.
 * Each turn is converted via convertTurn; falsy optional fields (empty
 * string, null, etc.) are normalized to undefined so they are omitted
 * when the payload is JSON-serialized.
 *
 * @param {object} testCase - Multi-turn test case.
 * @returns {object} Plain object safe to JSON-serialize for the API.
 */
function convertConversationalTestCase(testCase) {
    const { scenario, expectedOutcome, userDescription, chatbotRole } = testCase;
    return {
        turns: testCase.turns.map((turn) => convertTurn(turn)),
        scenario: scenario || undefined,
        expectedOutcome: expectedOutcome || undefined,
        userDescription: userDescription || undefined,
        chatbotRole: chatbotRole || undefined,
    };
}
|
|
71
|
+
/**
 * Normalizes a hyperparameter map for the API. Prompt instances are resolved
 * to `{ id, type }` references (pushing the prompt first when it has no
 * hash/type yet); every other value is stringified.
 *
 * @param {object} hyperparameters - Map of hyperparameter name to value.
 * @returns {Promise<object>} Map of name to string or prompt reference.
 */
async function processHyperparameters(hyperparameters) {
    const processed = {};
    for (const key of Object.keys(hyperparameters)) {
        const value = hyperparameters[key];
        if (!(value instanceof prompt_1.Prompt)) {
            // Non-Prompt values are recorded as their string form.
            processed[key] = String(value);
            continue;
        }
        try {
            // A prompt without a concrete hash/type must be pushed so the
            // server can assign it an identity before we reference it.
            const needsPush = !value.hash || value.hash === "latest" || !value.type;
            if (needsPush) {
                await value.push();
            }
            processed[key] = {
                id: value.hash,
                // NOTE(review): `!== null` treats an undefined textTemplate as
                // TEXT — confirm that is the intended fallback.
                type: value.type || (value.textTemplate !== null ? "TEXT" : "LIST"),
            };
        }
        catch (e) {
            // Best-effort: a failed prompt resolution must not abort the run.
            console.warn(`Failed to process prompt hyperparameter '${key}':`, e);
            processed[key] = "Error processing prompt";
        }
    }
    return processed;
}
|
|
95
|
+
/**
 * Starts a remote evaluation of the given test cases against a metric
 * collection on Confident AI.
 *
 * Exactly one family of test cases is posted per call: single-turn
 * `llmTestCases` take precedence over `conversationalTestCases` when both
 * are non-empty.
 *
 * @param {object} params
 * @param {string} params.metricCollection - Name of the metric collection.
 * @param {Array} [params.llmTestCases] - Single-turn test cases.
 * @param {Array} [params.conversationalTestCases] - Multi-turn test cases.
 * @param {object} [params.hyperparameters] - Optional hyperparameter map.
 * @param {string} [params.identifier] - Optional run identifier.
 * @throws {Error} When no non-empty test-case array is supplied, when not
 *   logged in to Confident AI, or when the API request fails.
 */
async function evaluate(params) {
    const { metricCollection, llmTestCases, conversationalTestCases, hyperparameters, identifier, } = params;
    /////////////////////////////////////////////////////////
    /// Type Checking
    /////////////////////////////////////////////////////////
    if ((llmTestCases?.length ?? 0) === 0 &&
        (conversationalTestCases?.length ?? 0) === 0) {
        throw new Error("You must provide either a non-empty array of 'llmTestCases' or 'conversationalTestCases'");
    }
    const testCaseLength = (llmTestCases?.length ?? 0) + (conversationalTestCases?.length ?? 0);
    ////////////////////////////////////////////////////////
    /// Posting Data
    /////////////////////////////////////////////////////////
    if ((0, utils_1.isConfident)()) {
        console.log(`Sending ${testCaseLength} test case(s) to Confident AI...`);
        const startTime = performance.now();
        try {
            const api = new api_1.Api(undefined, api_1.API_BASE_URL);
            let processedHyperparameters;
            if (hyperparameters) {
                processedHyperparameters = await processHyperparameters(hyperparameters);
            }
            let confidentRequestData;
            // BUG FIX: previously `if (llmTestCases)` treated an EMPTY array as
            // truthy, so `{ llmTestCases: [], conversationalTestCases: [...] }`
            // slipped past the length guard above, posted zero single-turn
            // cases, and silently dropped the conversational ones. Branch on
            // length, not mere presence.
            if (llmTestCases?.length) {
                const convertedTestCases = llmTestCases.map(convertLLMTestCase);
                confidentRequestData = {
                    metricCollection,
                    llmTestCases: convertedTestCases,
                    hyperparameters: processedHyperparameters,
                    identifier,
                };
            }
            else if (conversationalTestCases?.length) {
                const convertedTestCases = conversationalTestCases.map(convertConversationalTestCase);
                confidentRequestData = {
                    metricCollection,
                    conversationalTestCases: convertedTestCases,
                    hyperparameters: processedHyperparameters,
                    identifier,
                };
            }
            else {
                // Unreachable given the guard above; kept as a defensive check.
                throw new Error("You must provide either a non-empty array of 'llmTestCases' or 'conversationalTestCases'");
            }
            const result = await api.sendRequest(api_1.HttpMethods.POST, api_1.Endpoints.EVALUATE_ENDPOINT, confidentRequestData);
            const endTime = performance.now();
            const timeTaken = ((endTime - startTime) / 1000).toFixed(2);
            if (result) {
                const response = {
                    link: result.link,
                };
                console.log(`Done! (${timeTaken}s)`);
                console.log(`✓ Evaluation of metric collection '${metricCollection}' started! View progress on ${response.link}`);
            }
        }
        catch (error) {
            // Report elapsed time even on failure, then surface the original error.
            const endTime = performance.now();
            const timeTaken = ((endTime - startTime) / 1000).toFixed(2);
            console.error(`Error! (${timeTaken}s)`);
            throw error;
        }
    }
    else {
        throw new Error("To run evaluations on Confident AI, run `deepeval login`.");
    }
}
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
3
|
+
if (k2 === undefined) k2 = k;
|
|
4
|
+
var desc = Object.getOwnPropertyDescriptor(m, k);
|
|
5
|
+
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
|
|
6
|
+
desc = { enumerable: true, get: function() { return m[k]; } };
|
|
7
|
+
}
|
|
8
|
+
Object.defineProperty(o, k2, desc);
|
|
9
|
+
}) : (function(o, m, k, k2) {
|
|
10
|
+
if (k2 === undefined) k2 = k;
|
|
11
|
+
o[k2] = m[k];
|
|
12
|
+
}));
|
|
13
|
+
var __exportStar = (this && this.__exportStar) || function(m, exports) {
|
|
14
|
+
for (var p in m) if (p !== "default" && !Object.prototype.hasOwnProperty.call(exports, p)) __createBinding(exports, m, p);
|
|
15
|
+
};
|
|
16
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
17
|
+
exports.evaluate = void 0;
|
|
18
|
+
/**
|
|
19
|
+
* Exports for the confident module
|
|
20
|
+
*/
|
|
21
|
+
__exportStar(require("./api"), exports);
|
|
22
|
+
__exportStar(require("./types"), exports);
|
|
23
|
+
var evaluate_1 = require("./evaluate");
|
|
24
|
+
Object.defineProperty(exports, "evaluate", { enumerable: true, get: function () { return evaluate_1.evaluate; } });
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
import { ConversationalTestCase, LLMTestCase } from "../test-case";
/**
 * Request payload posted to the Confident AI evaluate endpoint.
 * Exactly one of `llmTestCases` / `conversationalTestCases` is expected
 * to be populated per request.
 */
export interface ConfidentEvaluateRequestData {
    /** Name of the metric collection to evaluate against. */
    metricCollection: string;
    /** Single-turn test cases (serialized form). */
    llmTestCases?: LLMTestCase[];
    /** Multi-turn test cases (serialized form). */
    conversationalTestCases?: ConversationalTestCase[];
    // NOTE(review): evaluate.js can place `{ id, type }` objects here for
    // Prompt-valued hyperparameters, but this signature only allows string
    // values — confirm which is intended.
    hyperparameters?: {
        [key: string]: string;
    };
    /** Optional caller-supplied identifier for the evaluation run. */
    identifier?: string;
}
/** Response from the evaluate endpoint: a link to view run progress. */
export interface ConfidentEvaluateResponseData {
    link: string;
}
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
import { Environment } from "../tracing/utils";
/**
 * Tracing-related configuration resolved from environment variables
 * (see config/settings.js for the resolution logic and defaults).
 */
export interface Settings {
    /** Deployment environment tag attached to traces; defaults to DEVELOPMENT. */
    CONFIDENT_TRACE_ENVIRONMENT?: Environment;
    /** Whether verbose trace logging is enabled; defaults to true. */
    CONFIDENT_TRACE_VERBOSE?: boolean;
    /** Trace sampling rate (presumably a 0..1 fraction); defaults to 1.0. */
    CONFIDENT_TRACE_SAMPLE_RATE?: number;
    /** OTLP collector endpoint; defaults to the hosted Confident AI URL. */
    CONFIDENT_OTEL_URL?: string;
}
/** Returns the cached Settings, building them from the environment on first call. */
export declare function getSettings(): Settings;
/** Drops the cached Settings and rebuilds them from the current environment. */
export declare function resetSettings({ reloadDotenv, }?: {
    reloadDotenv?: boolean;
}): Settings;
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.getSettings = getSettings;
|
|
4
|
+
exports.resetSettings = resetSettings;
|
|
5
|
+
const utils_1 = require("../tracing/utils");
|
|
6
|
+
// Module-level cache for the lazily-built Settings object; cleared by resetSettings().
let _settings_singleton = null;
|
|
7
|
+
/**
 * Returns the process-wide Settings object, building it from environment
 * variables on first call and caching it in `_settings_singleton` afterwards.
 *
 * Resolution rules:
 * - CONFIDENT_TRACE_ENVIRONMENT: raw env value, else Environment.DEVELOPMENT.
 * - CONFIDENT_TRACE_VERBOSE: "yes"/"true"/"1" (case-insensitive) => true,
 *   any other set value => false, unset => true.
 * - CONFIDENT_TRACE_SAMPLE_RATE: parsed float, else 1.0.
 * - CONFIDENT_OTEL_URL: env value, else the hosted collector URL.
 *
 * @returns {object} The cached Settings object.
 */
function getSettings() {
    if (_settings_singleton === null) {
        // BUG FIX: parse the sample rate defensively — a malformed value
        // (e.g. "abc") previously propagated NaN into every sampling decision.
        let sampleRate = 1.0;
        if (process.env.CONFIDENT_TRACE_SAMPLE_RATE !== undefined) {
            const parsed = parseFloat(process.env.CONFIDENT_TRACE_SAMPLE_RATE);
            if (Number.isFinite(parsed)) {
                sampleRate = parsed;
            }
        }
        _settings_singleton = {
            CONFIDENT_TRACE_ENVIRONMENT: process.env.CONFIDENT_TRACE_ENVIRONMENT ||
                utils_1.Environment.DEVELOPMENT,
            CONFIDENT_TRACE_VERBOSE: process.env.CONFIDENT_TRACE_VERBOSE !== undefined
                ? ["yes", "true", "1"].includes(process.env.CONFIDENT_TRACE_VERBOSE.toLowerCase())
                : true,
            CONFIDENT_TRACE_SAMPLE_RATE: sampleRate,
            CONFIDENT_OTEL_URL: process.env.CONFIDENT_OTEL_URL ||
                "https://otel.confident-ai.com",
        };
    }
    return _settings_singleton;
}
|
|
24
|
+
/**
 * Invalidates the cached Settings and immediately rebuilds them from the
 * current process environment.
 *
 * @param {{ reloadDotenv?: boolean }} [options] - `reloadDotenv` is accepted
 *   for API compatibility; re-reading .env files is not implemented yet.
 * @returns {object} The freshly rebuilt Settings object.
 */
function resetSettings(options = {}) {
    const reloadDotenv = options.reloadDotenv ?? false;
    if (reloadDotenv) {
        // TODO
    }
    _settings_singleton = null;
    return getSettings();
}
|
|
@@ -0,0 +1,4 @@
|
|
|
1
|
+
/** Names of the environment variables consulted by the tracing configuration. */
export declare const CONFIDENT_TRACE_VERBOSE = "CONFIDENT_TRACE_VERBOSE";
export declare const CONFIDENT_TRACE_SAMPLE_RATE = "CONFIDENT_TRACE_SAMPLE_RATE";
export declare const CONFIDENT_TRACE_ENVIRONMENT = "CONFIDENT_TRACE_ENVIRONMENT";
export declare const CONFIDENT_TRACING_ENABLED = "CONFIDENT_TRACING_ENABLED";
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.CONFIDENT_TRACING_ENABLED = exports.CONFIDENT_TRACE_ENVIRONMENT = exports.CONFIDENT_TRACE_SAMPLE_RATE = exports.CONFIDENT_TRACE_VERBOSE = void 0;
|
|
4
|
+
exports.CONFIDENT_TRACE_VERBOSE = "CONFIDENT_TRACE_VERBOSE";
|
|
5
|
+
exports.CONFIDENT_TRACE_SAMPLE_RATE = "CONFIDENT_TRACE_SAMPLE_RATE";
|
|
6
|
+
exports.CONFIDENT_TRACE_ENVIRONMENT = "CONFIDENT_TRACE_ENVIRONMENT";
|
|
7
|
+
exports.CONFIDENT_TRACING_ENABLED = "CONFIDENT_TRACING_ENABLED";
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
import { ConversationalGolden, Golden } from "./golden";
/**
 * Request payload for creating or pushing a dataset; only one of
 * `goldens` / `conversationalGoldens` is populated per request.
 */
export interface APIDataset {
    alias: string;
    overwrite?: boolean;
    goldens?: Golden[];
    // NOTE(review): typed `any[]` here but `ConversationalGolden[]` in
    // DatasetHttpResponse below — tightening to `ConversationalGolden[]`
    // looks intended; confirm no caller relies on the loose typing.
    conversationalGoldens?: any[];
}
/** Response returned when a dataset is created/pushed: a link to view it. */
export interface CreateDatasetHttpResponse {
    link: string;
}
/**
 * Response returned when a dataset is pulled.
 * NOTE(review): dataset.js pull() reads `result.datasetId`, while this type
 * declares `id` — one of the two is wrong; verify against the live API.
 */
export interface DatasetHttpResponse {
    goldens: Golden[];
    conversationalGoldens: ConversationalGolden[];
    id: string;
}
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
import { ConversationalGolden, Golden } from "./golden";
import { ConversationalTestCase, LLMTestCase } from "../test-case";
/** A golden of either the single-turn or multi-turn variety. */
export type GoldenUnion = Golden | ConversationalGolden;
/** A homogeneous array of single-turn or multi-turn goldens. */
export type GoldenUnionArray = Golden[] | ConversationalGolden[];
/** A test case of either the single-turn or multi-turn variety. */
export type TestCaseUnion = LLMTestCase | ConversationalTestCase;
/** A homogeneous array of single-turn or multi-turn test cases. */
export type TestCaseUnionArray = LLMTestCase[] | ConversationalTestCase[];
/**
 * Container for goldens and test cases that can be pushed to / pulled from
 * Confident AI. The dataset is either single-turn or multi-turn; the mode is
 * inferred from the first golden added and enforced thereafter.
 */
export declare class EvaluationDataset {
    private _multiTurn;
    private _alias;
    private _id;
    private _goldens;
    private _conversationalGoldens;
    private _llmTestCases;
    private _conversationalTestCases;
    constructor(params?: {
        goldens?: GoldenUnionArray;
    });
    toString(): string;
    /** Goldens of the active mode (multi-turn or single-turn). */
    get goldens(): GoldenUnionArray;
    set goldens(goldens: GoldenUnionArray);
    /** Adds a golden, inferring the dataset mode on first add. */
    addGolden(golden: GoldenUnion): void;
    private _addGolden;
    private _addConversationalGolden;
    /** Test cases of the active mode (multi-turn or single-turn). */
    get testCases(): TestCaseUnionArray;
    set testCases(testCases: TestCaseUnionArray);
    addTestCase(testCase: TestCaseUnion): void;
    /** Downloads the dataset with the given alias, replacing local contents. */
    pull(params: {
        alias: string;
        finalized?: boolean;
        autoConvertGoldensToTestCases?: boolean;
    }): Promise<void>;
    /** Uploads the local goldens under the given alias. */
    push(params: {
        alias: string;
        overwrite?: boolean;
    }): Promise<void>;
    /** Queues goldens for annotation on an existing dataset. */
    queue(params: {
        alias: string;
        goldens: Array<Golden | ConversationalGolden>;
        printResponse?: boolean;
    }): Promise<void>;
    /**
     * Parses a CSV file into LLMTestCases.
     * NOTE(review): despite the name, the implementation returns the parsed
     * test cases without adding them to the dataset — confirm intent.
     */
    addTestCasesFromCSV({ filePath, inputCol, actualOutputCol, expectedOutputCol, contextCol, contextDelimiter, retrievalContextCol, retrievalContextDelimiter, toolsCalledCol, expectedToolsCol, additionalMetadataCol, }: {
        filePath: string;
        inputCol: string;
        actualOutputCol: string;
        expectedOutputCol?: string;
        contextCol?: string;
        contextDelimiter?: string;
        retrievalContextCol?: string;
        retrievalContextDelimiter?: string;
        toolsCalledCol?: string;
        expectedToolsCol?: string;
        additionalMetadataCol?: string;
    }): Promise<any>;
}
|
|
@@ -0,0 +1,289 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __importDefault = (this && this.__importDefault) || function (mod) {
|
|
3
|
+
return (mod && mod.__esModule) ? mod : { "default": mod };
|
|
4
|
+
};
|
|
5
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
6
|
+
exports.EvaluationDataset = void 0;
|
|
7
|
+
const node_fs_1 = __importDefault(require("node:fs"));
|
|
8
|
+
const papaparse_1 = __importDefault(require("papaparse"));
|
|
9
|
+
const utils_1 = require("./utils");
|
|
10
|
+
const utils_2 = require("../utils");
|
|
11
|
+
const api_1 = require("../confident/api");
|
|
12
|
+
const golden_1 = require("./golden");
|
|
13
|
+
const test_case_1 = require("../test-case");
|
|
14
|
+
/**
 * Container for goldens and test cases that can be pushed to / pulled from
 * Confident AI. A dataset is either single-turn (Golden / LLMTestCase) or
 * multi-turn (ConversationalGolden / ConversationalTestCase); `_multiTurn`
 * starts null (unknown) and is fixed by the first golden seen.
 */
class EvaluationDataset {
    // null = mode not yet decided; true = multi-turn; false = single-turn.
    _multiTurn = null;
    // Alias/id of the remote dataset, set after pull().
    _alias = null;
    _id = null;
    _goldens = [];
    _conversationalGoldens = [];
    _llmTestCases = [];
    _conversationalTestCases = [];
    constructor(params = {}) {
        this._alias = null;
        this._id = null;
        const goldens = params.goldens ?? [];
        // The first golden's type decides single- vs multi-turn mode.
        if (goldens.length > 0) {
            this._multiTurn = goldens[0] instanceof golden_1.ConversationalGolden;
        }
        this._goldens = [];
        this._conversationalGoldens = [];
        for (const golden of goldens) {
            // NOTE(review): this assigns the array LENGTH to every golden's
            // rank, unlike the testCases setter below which assigns per-item
            // indices — looks like it should be the golden's index; confirm.
            golden._datasetRank = goldens.length;
            if (this._multiTurn) {
                this._addConversationalGolden(golden);
            }
            else {
                this._addGolden(golden);
            }
        }
        this._llmTestCases = [];
        this._conversationalTestCases = [];
    }
    // Debug representation mirroring the Python deepeval repr.
    toString() {
        return `${this.constructor.name}(test_cases=${JSON.stringify(this.testCases)}, goldens=${JSON.stringify(this.goldens)}, _alias=${this._alias}, _id=${this._id}, _multi_turn=${this._multiTurn})`;
    }
    ////////////////////////////////////////////////////////
    // Golden Properties
    ////////////////////////////////////////////////////////
    // Returns whichever golden list matches the active mode.
    get goldens() {
        return this._multiTurn ? this._conversationalGoldens : this._goldens;
    }
    // Replaces the golden lists atomically: on any type error the previous
    // lists are restored before rethrowing.
    set goldens(goldens) {
        const prevGoldens = this._goldens;
        const prevConvGoldens = this._conversationalGoldens;
        this._goldens = [];
        this._conversationalGoldens = [];
        try {
            for (const golden of goldens) {
                if (!(golden instanceof golden_1.Golden) &&
                    !(golden instanceof golden_1.ConversationalGolden)) {
                    throw new TypeError("Your goldens must be instances of either ConversationalGolden or Golden");
                }
                golden._datasetAlias = this._alias ?? undefined;
                golden._datasetId = this._id ?? undefined;
                // NOTE(review): same length-as-rank pattern as the constructor —
                // confirm this should not be the item's index.
                golden._datasetRank = goldens.length;
                if (this._multiTurn) {
                    this._addConversationalGolden(golden);
                }
                else {
                    this.addGolden(golden);
                }
            }
        }
        catch (e) {
            this._goldens = prevGoldens;
            this._conversationalGoldens = prevConvGoldens;
            throw e;
        }
    }
    // Adds a golden; if the mode is still undecided, the golden's type fixes it.
    addGolden(golden) {
        if (this._multiTurn === null) {
            this._multiTurn = golden instanceof golden_1.ConversationalGolden;
        }
        if (this._multiTurn) {
            this._addConversationalGolden(golden);
        }
        else {
            this._addGolden(golden);
        }
    }
    // Appends to the single-turn list, rejecting multi-turn goldens.
    _addGolden(golden) {
        if (golden instanceof golden_1.Golden) {
            this._goldens.push(golden);
        }
        else {
            throw new TypeError("You cannot add a multi-turn ConversationalGolden to a single-turn dataset. You can only add a Golden.");
        }
    }
    // Appends to the multi-turn list, rejecting single-turn goldens.
    _addConversationalGolden(golden) {
        if (golden instanceof golden_1.ConversationalGolden) {
            this._conversationalGoldens.push(golden);
        }
        else {
            throw new TypeError("You cannot add a single-turn Golden to a multi-turn dataset. You can only add a ConversationalGolden.");
        }
    }
    ////////////////////////////////////////////////////////
    // Test Case Properties
    ////////////////////////////////////////////////////////
    // Returns whichever test-case list matches the active mode.
    get testCases() {
        return this._multiTurn ? this._conversationalTestCases : this._llmTestCases;
    }
    // Rebuilds both test-case lists; unknown element types are silently skipped.
    set testCases(testCases) {
        const llmTestCases = [];
        const conversationalTestCases = [];
        for (const testCase of testCases) {
            if (!(testCase instanceof test_case_1.LLMTestCase) &&
                !(testCase instanceof test_case_1.ConversationalTestCase)) {
                continue;
            }
            testCase._datasetAlias = this._alias ?? undefined;
            testCase._datasetId = this._id ?? undefined;
            if (testCase instanceof test_case_1.LLMTestCase) {
                // Rank is the item's position within its own type's list.
                testCase._datasetRank = llmTestCases.length;
                llmTestCases.push(testCase);
            }
            else if (testCase instanceof test_case_1.ConversationalTestCase) {
                testCase._datasetRank = conversationalTestCases.length;
                conversationalTestCases.push(testCase);
            }
        }
        this._llmTestCases = llmTestCases;
        this._conversationalTestCases = conversationalTestCases;
    }
    // Appends a single test case to the list matching its type, tagging it
    // with the dataset's alias/id and its position as rank.
    addTestCase(testCase) {
        testCase._datasetAlias = this._alias ?? undefined;
        testCase._datasetId = this._id ?? undefined;
        if (testCase instanceof test_case_1.LLMTestCase) {
            testCase._datasetRank = this._llmTestCases.length;
            this._llmTestCases.push(testCase);
        }
        else if (testCase instanceof test_case_1.ConversationalTestCase) {
            testCase._datasetRank = this._conversationalTestCases.length;
            this._conversationalTestCases.push(testCase);
        }
    }
    ////////////////////////////////////////////////////////
    // Push and Pull Methods
    ////////////////////////////////////////////////////////
    // Downloads the dataset with the given alias from Confident AI, REPLACING
    // all local goldens and test cases. Requires CONFIDENT_API_KEY.
    async pull(params) {
        const { alias, finalized = true, autoConvertGoldensToTestCases = false, } = params;
        if (!(0, utils_2.isConfident)()) {
            throw new Error("Set CONFIDENT_API_KEY to pull dataset.");
        }
        console.log(`Pulling '${alias}' from Confident AI...`);
        const api = new api_1.Api();
        const startTime = performance.now();
        const result = await api.sendRequest(api_1.HttpMethods.GET, api_1.Endpoints.DATASET_ENDPOINT, undefined, { alias, finalized: finalized.toString().toLowerCase() });
        // Rehydrate the raw response into Golden / ConversationalGolden instances.
        const response = {
            goldens: result.goldens
                ? result.goldens.map((goldenData) => new golden_1.Golden({
                    input: goldenData.input,
                    actualOutput: goldenData.actualOutput,
                    expectedOutput: goldenData.expectedOutput,
                    context: goldenData.context,
                    retrievalContext: goldenData.retrievalContext,
                    toolsCalled: goldenData.toolsCalled,
                    expectedTools: goldenData.expectedTools,
                    additionalMetadata: goldenData.additionalMetadata,
                    sourceFile: goldenData.sourceFile,
                    comments: goldenData.comments,
                }))
                : undefined,
            conversationalGoldens: result.conversationalGoldens
                ? result.conversationalGoldens.map((goldenData) => new golden_1.ConversationalGolden({
                    scenario: goldenData.scenario,
                    expectedOutcome: goldenData.expectedOutcome,
                    userDescription: goldenData.userDescription,
                    context: goldenData.context,
                    additionalMetadata: goldenData.additionalMetadata,
                    comments: goldenData.comments,
                    name: goldenData.name,
                    customColumnKeyValues: goldenData.customColumnKeyValues,
                    turns: goldenData.turns,
                    _datasetRank: goldenData._datasetRank,
                    _datasetAlias: goldenData._datasetAlias,
                    _datasetId: goldenData._datasetId,
                }))
                : undefined,
            // NOTE(review): reads `datasetId` while DatasetHttpResponse declares
            // `id` — one of the two is wrong; verify against the live API.
            id: result.datasetId,
        };
        this._alias = alias;
        this._id = response.id;
        // Mode is inferred from which list the server returned.
        this._multiTurn = result.goldens === undefined;
        this.goldens = [];
        this.testCases = [];
        if (autoConvertGoldensToTestCases) {
            if (!this._multiTurn) {
                const llmTestCases = (0, utils_1.convertGoldensToTestCases)(response.goldens, alias, response.id);
                this._llmTestCases.push(...llmTestCases);
            }
            else {
                const conversationalTestCases = (0, utils_1.convertConvoGoldensToConvoTestCases)(response.conversationalGoldens, alias, response.id);
                this._conversationalTestCases.push(...conversationalTestCases);
            }
        }
        else {
            if (!this._multiTurn) {
                this.goldens = response.goldens;
            }
            else {
                this.goldens = response.conversationalGoldens;
                for (const golden of this.goldens) {
                    golden._datasetAlias = alias;
                    golden._datasetId = response.id;
                }
            }
        }
        const endTime = performance.now();
        const timeTaken = ((endTime - startTime) / 1000).toFixed(2);
        console.log(`Done! (${timeTaken}s)`);
    }
    // Uploads the local goldens under the given alias. The JSON round-trip
    // drops undefined fields before private fields are stripped.
    async push(params) {
        const { alias, overwrite = false } = params;
        if (this.goldens.length === 0) {
            throw new Error("Unable to push empty dataset to Confident AI, there must be at least one golden in dataset.");
        }
        const api = new api_1.Api();
        const apiDataset = {
            alias,
            overwrite,
            goldens: !this._multiTurn ? this.goldens : undefined,
            conversationalGoldens: this._multiTurn ? this.goldens : undefined,
        };
        const body = (0, utils_1.stripPrivateFields)(JSON.parse(JSON.stringify(apiDataset)));
        console.log(`Pushing '${alias}' to Confident AI...`);
        const result = await api.sendRequest(api_1.HttpMethods.POST, api_1.Endpoints.DATASET_ENDPOINT, body);
        const link = result?.link;
        if (link) {
            console.log(`✅ Dataset successfully pushed to Confident AI! View at: ${link}`);
        }
    }
    ////////////////////////////////////////////////////////
    // Queue Methods
    ////////////////////////////////////////////////////////
    // Queues goldens for annotation on an existing dataset; the mode is
    // inferred from the first golden's type. Posts to the queue sub-path.
    // NOTE(review): unlike push(), the body is NOT JSON round-tripped before
    // stripPrivateFields — confirm the asymmetry is intended.
    async queue(params) {
        const { alias, goldens, printResponse = true } = params;
        if (!goldens || goldens.length === 0) {
            throw new Error(`Can't queue empty list of goldens to dataset with alias: ${alias} on Confident AI.`);
        }
        const api = new api_1.Api();
        const isMultiTurn = goldens[0] instanceof golden_1.ConversationalGolden;
        const apiDataset = {
            alias,
            goldens: !isMultiTurn ? goldens : undefined,
            conversationalGoldens: isMultiTurn ? goldens : undefined,
        };
        const body = (0, utils_1.stripPrivateFields)(apiDataset);
        console.log(`Queueing ${goldens.length} golden(s) to '${alias}' on Confident AI...`);
        const result = await api.sendRequest(api_1.HttpMethods.POST, api_1.Endpoints.DATASET_ENDPOINT, body, undefined, `/v1/datasets/${alias}/queue`);
        const link = result?.link;
        if (link && printResponse) {
            console.log(`✅ Goldens successfully queued to Confident AI! Annotate & finalize at: ${link}`);
        }
    }
    // Parses a CSV file (header row required) into LLMTestCase instances.
    // NOTE(review): despite the name, the cases are RETURNED, not added to
    // this dataset — callers must add them themselves; confirm intent.
    async addTestCasesFromCSV({ filePath, inputCol, actualOutputCol, expectedOutputCol, contextCol, contextDelimiter = ";", retrievalContextCol, retrievalContextDelimiter = ";", toolsCalledCol, expectedToolsCol, additionalMetadataCol, }) {
        const csvData = node_fs_1.default.readFileSync(filePath, "utf8");
        const { data, errors } = papaparse_1.default.parse(csvData, {
            header: true,
            skipEmptyLines: true,
        });
        if (errors.length) {
            throw new Error(`CSV parse error: ${errors[0].message}`);
        }
        // Optional columns: when a column name is undefined, `row[undefined]`
        // is undefined and is presumably handled by the parse helpers — verify.
        return data.map((row) => new test_case_1.LLMTestCase({
            input: row[inputCol],
            actualOutput: row[actualOutputCol],
            expectedOutput: expectedOutputCol
                ? row[expectedOutputCol]
                : undefined,
            context: (0, utils_1.parseDelimited)(row[contextCol], contextDelimiter),
            retrievalContext: (0, utils_1.parseDelimited)(row[retrievalContextCol], retrievalContextDelimiter),
            toolsCalled: (0, utils_1.safeJsonParse)(row[toolsCalledCol], []),
            expectedTools: (0, utils_1.safeJsonParse)(row[expectedToolsCol], []),
            additionalMetadata: (0, utils_1.safeJsonParse)(row[additionalMetadataCol], undefined),
        }));
    }
}
|
|
289
|
+
// Public export of the dataset container class.
exports.EvaluationDataset = EvaluationDataset;
|