llmtester 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +156 -0
- package/bin/cli.js +2 -0
- package/bin/tui.js +2 -0
- package/dist/benchmarks.d.ts +17 -0
- package/dist/benchmarks.d.ts.map +1 -0
- package/dist/benchmarks.js +612 -0
- package/dist/benchmarks.js.map +1 -0
- package/dist/client.d.ts +69 -0
- package/dist/client.d.ts.map +1 -0
- package/dist/client.js +103 -0
- package/dist/client.js.map +1 -0
- package/dist/evaluator.d.ts +57 -0
- package/dist/evaluator.d.ts.map +1 -0
- package/dist/evaluator.js +410 -0
- package/dist/evaluator.js.map +1 -0
- package/dist/index.d.ts +3 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +515 -0
- package/dist/index.js.map +1 -0
- package/dist/logger.d.ts +16 -0
- package/dist/logger.d.ts.map +1 -0
- package/dist/logger.js +30 -0
- package/dist/logger.js.map +1 -0
- package/dist/paths.d.ts +6 -0
- package/dist/paths.d.ts.map +1 -0
- package/dist/paths.js +49 -0
- package/dist/paths.js.map +1 -0
- package/dist/progress.d.ts +13 -0
- package/dist/progress.d.ts.map +1 -0
- package/dist/progress.js +47 -0
- package/dist/progress.js.map +1 -0
- package/dist/tui.d.ts +3 -0
- package/dist/tui.d.ts.map +1 -0
- package/dist/tui.js +326 -0
- package/dist/tui.js.map +1 -0
- package/package.json +45 -0
package/dist/client.d.ts
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
/** Provider tags a client can carry. */
export type ProviderType =
    | 'openai'
    | 'anthropic'
    | 'custom';
|
|
2
|
+
/**
 * Contract implemented by the chat clients declared in this file.
 */
export interface LLMClient {
    /** Provider tag carried by the client. */
    provider: ProviderType;
    baseUrl: string;
    model: string;
    /**
     * Sends role/content messages and resolves with the reply text, the
     * finish reason and, when present, token usage counts.
     */
    chat(
        messages: Array<{ role: string; content: string }>,
        options?: { temperature?: number; maxTokens?: number },
    ): Promise<{
        content: string;
        finishReason: string;
        usage?: { promptTokens: number; completionTokens: number; totalTokens: number };
    }>;
}
|
|
22
|
+
/**
 * Declaration of the OpenAI-compatible chat client.
 * The `chat` signature is derived from `LLMClient` instead of being repeated.
 */
export declare class OpenAICompatibleClient implements LLMClient {
    provider: ProviderType;
    baseUrl: string;
    model: string;
    private apiKey;
    private httpClient;
    constructor(apiKey: string, baseUrl: string, model: string);
    chat(
        messages: Parameters<LLMClient['chat']>[0],
        options?: Parameters<LLMClient['chat']>[1],
    ): ReturnType<LLMClient['chat']>;
}
|
|
45
|
+
/**
 * Declaration of the Anthropic chat client.
 * The `chat` signature is derived from `LLMClient` instead of being repeated.
 */
export declare class AnthropicClient implements LLMClient {
    provider: ProviderType;
    baseUrl: string;
    model: string;
    private apiKey;
    private httpClient;
    constructor(apiKey: string, baseUrl: string, model: string);
    chat(
        messages: Parameters<LLMClient['chat']>[0],
        options?: Parameters<LLMClient['chat']>[1],
    ): ReturnType<LLMClient['chat']>;
}
|
|
68
|
+
/**
 * Factory that returns an `LLMClient` for the given provider, API key,
 * base URL and model.
 */
export declare function createLLMClient(
    provider: ProviderType,
    apiKey: string,
    baseUrl: string,
    model: string,
): LLMClient;
|
|
69
|
+
//# sourceMappingURL=client.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"client.d.ts","sourceRoot":"","sources":["../src/client.ts"],"names":[],"mappings":"AAEA,MAAM,MAAM,YAAY,GAAG,QAAQ,GAAG,WAAW,GAAG,QAAQ,CAAC;AAE7D,MAAM,WAAW,SAAS;IACxB,QAAQ,EAAE,YAAY,CAAC;IACvB,OAAO,EAAE,MAAM,CAAC;IAChB,KAAK,EAAE,MAAM,CAAC;IACd,IAAI,CAAC,QAAQ,EAAE;QAAE,IAAI,EAAE,MAAM,CAAC;QAAC,OAAO,EAAE,MAAM,CAAA;KAAE,EAAE,EAAE,OAAO,CAAC,EAAE;QAC5D,WAAW,CAAC,EAAE,MAAM,CAAC;QACrB,SAAS,CAAC,EAAE,MAAM,CAAC;KACpB,GAAG,OAAO,CAAC;QACV,OAAO,EAAE,MAAM,CAAC;QAChB,YAAY,EAAE,MAAM,CAAC;QACrB,KAAK,CAAC,EAAE;YAAE,YAAY,EAAE,MAAM,CAAC;YAAC,gBAAgB,EAAE,MAAM,CAAC;YAAC,WAAW,EAAE,MAAM,CAAA;SAAE,CAAC;KACjF,CAAC,CAAC;CACJ;AAED,qBAAa,sBAAuB,YAAW,SAAS;IAC/C,QAAQ,EAAE,YAAY,CAAY;IAClC,OAAO,EAAE,MAAM,CAAC;IAChB,KAAK,EAAE,MAAM,CAAC;IACrB,OAAO,CAAC,MAAM,CAAS;IACvB,OAAO,CAAC,UAAU,CAAgB;gBAEtB,MAAM,EAAE,MAAM,EAAE,OAAO,EAAE,MAAM,EAAE,KAAK,EAAE,MAAM;IAcpD,IAAI,CAAC,QAAQ,EAAE;QAAE,IAAI,EAAE,MAAM,CAAC;QAAC,OAAO,EAAE,MAAM,CAAA;KAAE,EAAE,EAAE,OAAO,CAAC,EAAE;QAClE,WAAW,CAAC,EAAE,MAAM,CAAC;QACrB,SAAS,CAAC,EAAE,MAAM,CAAC;KACpB,GAAG,OAAO,CAAC;QACV,OAAO,EAAE,MAAM,CAAC;QAChB,YAAY,EAAE,MAAM,CAAC;QACrB,KAAK,CAAC,EAAE;YAAE,YAAY,EAAE,MAAM,CAAC;YAAC,gBAAgB,EAAE,MAAM,CAAC;YAAC,WAAW,EAAE,MAAM,CAAA;SAAE,CAAC;KACjF,CAAC;CAmBH;AAED,qBAAa,eAAgB,YAAW,SAAS;IACxC,QAAQ,EAAE,YAAY,CAAe;IACrC,OAAO,EAAE,MAAM,CAAC;IAChB,KAAK,EAAE,MAAM,CAAC;IACrB,OAAO,CAAC,MAAM,CAAS;IACvB,OAAO,CAAC,UAAU,CAAgB;gBAEtB,MAAM,EAAE,MAAM,EAAE,OAAO,EAAE,MAAM,EAAE,KAAK,EAAE,MAAM;IAgBpD,IAAI,CAAC,QAAQ,EAAE;QAAE,IAAI,EAAE,MAAM,CAAC;QAAC,OAAO,EAAE,MAAM,CAAA;KAAE,EAAE,EAAE,OAAO,CAAC,EAAE;QAClE,WAAW,CAAC,EAAE,MAAM,CAAC;QACrB,SAAS,CAAC,EAAE,MAAM,CAAC;KACpB,GAAG,OAAO,CAAC;QACV,OAAO,EAAE,MAAM,CAAC;QAChB,YAAY,EAAE,MAAM,CAAC;QACrB,KAAK,CAAC,EAAE;YAAE,YAAY,EAAE,MAAM,CAAC;YAAC,gBAAgB,EAAE,MAAM,CAAC;YAAC,WAAW,EAAE,MAAM,CAAA;SAAE,CAAC;KACjF,CAAC;CA8BH;AAED,wBAAgB,eAAe,CAC7B,QAAQ,EAAE,YAAY,EACtB,MAAM,EAAE,MAAM,EACd,OAAO,EAAE,MAAM,EACf,KAAK,EAAE,MAAM,GACZ,SAAS,CAKX"}
|
package/dist/client.js
ADDED
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __importDefault = (this && this.__importDefault) || function (mod) {
|
|
3
|
+
return (mod && mod.__esModule) ? mod : { "default": mod };
|
|
4
|
+
};
|
|
5
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
6
|
+
exports.AnthropicClient = exports.OpenAICompatibleClient = void 0;
|
|
7
|
+
exports.createLLMClient = createLLMClient;
|
|
8
|
+
const axios_1 = __importDefault(require("axios"));
|
|
9
|
+
/**
 * Chat client for endpoints exposing an OpenAI-style `/chat/completions` route.
 */
class OpenAICompatibleClient {
    provider = 'openai';
    baseUrl;
    model;
    apiKey;
    httpClient;
    /**
     * @param apiKey  Sent as a Bearer token in the Authorization header.
     * @param baseUrl Base URL handed to the axios instance.
     * @param model   Model name sent with every request.
     */
    constructor(apiKey, baseUrl, model) {
        this.apiKey = apiKey;
        this.baseUrl = baseUrl;
        this.model = model;
        const headers = {
            'Authorization': `Bearer ${apiKey}`,
            'Content-Type': 'application/json',
        };
        this.httpClient = axios_1.default.create({ baseURL: baseUrl, headers, timeout: 120000 });
    }
    /**
     * Posts the conversation and maps the first choice of the reply.
     * Temperature defaults to 0 and max_tokens to 512 when not supplied.
     * @returns `{ content, finishReason, usage }`; `usage` is undefined when
     *          the reply carries no usage object.
     */
    async chat(messages, options) {
        const payload = {
            model: this.model,
            messages,
            temperature: options?.temperature ?? 0,
            max_tokens: options?.maxTokens ?? 512,
        };
        const { data } = await this.httpClient.post('/chat/completions', payload);
        const firstChoice = data.choices?.[0];
        let usage;
        if (data.usage) {
            usage = {
                promptTokens: data.usage.prompt_tokens || 0,
                completionTokens: data.usage.completion_tokens || 0,
                totalTokens: data.usage.total_tokens || 0,
            };
        }
        return {
            content: firstChoice?.message?.content || '',
            finishReason: firstChoice?.finish_reason || '',
            usage,
        };
    }
}
|
|
47
|
+
exports.OpenAICompatibleClient = OpenAICompatibleClient;
|
|
48
|
+
/**
 * Chat client for the Anthropic Messages endpoint (`/v1/messages`).
 */
class AnthropicClient {
    provider = 'anthropic';
    baseUrl;
    model;
    apiKey;
    httpClient;
    /**
     * @param apiKey  Sent in the `x-api-key` header.
     * @param baseUrl Base URL for requests; falls back to
     *                https://api.anthropic.com when empty.
     * @param model   Model name sent with every request.
     */
    constructor(apiKey, baseUrl, model) {
        this.apiKey = apiKey;
        this.baseUrl = baseUrl;
        this.model = model;
        this.httpClient = axios_1.default.create({
            baseURL: baseUrl || 'https://api.anthropic.com',
            headers: {
                'x-api-key': apiKey,
                'Content-Type': 'application/json',
                'anthropic-version': '2023-06-01',
            },
            timeout: 120000,
        });
    }
    /**
     * Sends the conversation and returns the reply text.
     *
     * System-role messages are lifted out of the list and sent through the
     * top-level `system` field; every one of them is forwarded (joined with a
     * blank line), not just the first. Any role other than 'assistant' is sent
     * as 'user'. Temperature defaults to 0 and max_tokens to 1024.
     *
     * @returns `{ content, finishReason, usage }`; `content` is the
     *          concatenation of all text blocks in the reply, so a leading
     *          non-text block no longer yields an empty string.
     */
    async chat(messages, options) {
        const systemMessages = messages.filter(m => m.role === 'system');
        const chatMessages = messages.filter(m => m.role !== 'system');
        const requestBody = {
            model: this.model,
            messages: chatMessages.map(m => ({
                role: m.role === 'assistant' ? 'assistant' : 'user',
                content: m.content,
            })),
            temperature: options?.temperature ?? 0,
            max_tokens: options?.maxTokens ?? 1024,
            ...(systemMessages.length > 0
                ? { system: systemMessages.map(m => m.content).join('\n\n') }
                : {}),
        };
        const { data } = await this.httpClient.post('/v1/messages', requestBody);
        // The reply body is a list of content blocks; keep only those carrying text.
        const blocks = Array.isArray(data.content) ? data.content : [];
        const text = blocks
            .filter(block => typeof block?.text === 'string')
            .map(block => block.text)
            .join('');
        const inputTokens = data.usage?.input_tokens || 0;
        const outputTokens = data.usage?.output_tokens || 0;
        return {
            content: text,
            finishReason: data.stop_reason || '',
            usage: data.usage ? {
                promptTokens: inputTokens,
                completionTokens: outputTokens,
                totalTokens: inputTokens + outputTokens,
            } : undefined,
        };
    }
}
|
|
96
|
+
exports.AnthropicClient = AnthropicClient;
|
|
97
|
+
/**
 * Builds the client for a provider: 'anthropic' gets an AnthropicClient,
 * every other provider value gets an OpenAICompatibleClient.
 */
function createLLMClient(provider, apiKey, baseUrl, model) {
    const ClientClass = provider === 'anthropic' ? AnthropicClient : OpenAICompatibleClient;
    return new ClientClass(apiKey, baseUrl, model);
}
|
|
103
|
+
//# sourceMappingURL=client.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"client.js","sourceRoot":"","sources":["../src/client.ts"],"names":[],"mappings":";;;;;;AAiIA,0CAUC;AA3ID,kDAA6C;AAkB7C,MAAa,sBAAsB;IAC1B,QAAQ,GAAiB,QAAQ,CAAC;IAClC,OAAO,CAAS;IAChB,KAAK,CAAS;IACb,MAAM,CAAS;IACf,UAAU,CAAgB;IAElC,YAAY,MAAc,EAAE,OAAe,EAAE,KAAa;QACxD,IAAI,CAAC,MAAM,GAAG,MAAM,CAAC;QACrB,IAAI,CAAC,OAAO,GAAG,OAAO,CAAC;QACvB,IAAI,CAAC,KAAK,GAAG,KAAK,CAAC;QACnB,IAAI,CAAC,UAAU,GAAG,eAAK,CAAC,MAAM,CAAC;YAC7B,OAAO,EAAE,OAAO;YAChB,OAAO,EAAE;gBACP,eAAe,EAAE,UAAU,MAAM,EAAE;gBACnC,cAAc,EAAE,kBAAkB;aACnC;YACD,OAAO,EAAE,MAAM;SAChB,CAAC,CAAC;IACL,CAAC;IAED,KAAK,CAAC,IAAI,CAAC,QAA6C,EAAE,OAGzD;QAKC,MAAM,QAAQ,GAAG,MAAM,IAAI,CAAC,UAAU,CAAC,IAAI,CAAC,mBAAmB,EAAE;YAC/D,KAAK,EAAE,IAAI,CAAC,KAAK;YACjB,QAAQ;YACR,WAAW,EAAE,OAAO,EAAE,WAAW,IAAI,CAAC;YACtC,UAAU,EAAE,OAAO,EAAE,SAAS,IAAI,GAAG;SACtC,CAAC,CAAC;QAEH,MAAM,MAAM,GAAG,QAAQ,CAAC,IAAI,CAAC,OAAO,EAAE,CAAC,CAAC,CAAC,CAAC;QAC1C,OAAO;YACL,OAAO,EAAE,MAAM,EAAE,OAAO,EAAE,OAAO,IAAI,EAAE;YACvC,YAAY,EAAE,MAAM,EAAE,aAAa,IAAI,EAAE;YACzC,KAAK,EAAE,QAAQ,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC;gBAC3B,YAAY,EAAE,QAAQ,CAAC,IAAI,CAAC,KAAK,CAAC,aAAa,IAAI,CAAC;gBACpD,gBAAgB,EAAE,QAAQ,CAAC,IAAI,CAAC,KAAK,CAAC,iBAAiB,IAAI,CAAC;gBAC5D,WAAW,EAAE,QAAQ,CAAC,IAAI,CAAC,KAAK,CAAC,YAAY,IAAI,CAAC;aACnD,CAAC,CAAC,CAAC,SAAS;SACd,CAAC;IACJ,CAAC;CACF;AA/CD,wDA+CC;AAED,MAAa,eAAe;IACnB,QAAQ,GAAiB,WAAW,CAAC;IACrC,OAAO,CAAS;IAChB,KAAK,CAAS;IACb,MAAM,CAAS;IACf,UAAU,CAAgB;IAElC,YAAY,MAAc,EAAE,OAAe,EAAE,KAAa;QACxD,IAAI,CAAC,MAAM,GAAG,MAAM,CAAC;QACrB,IAAI,CAAC,OAAO,GAAG,OAAO,CAAC;QACvB,IAAI,CAAC,KAAK,GAAG,KAAK,CAAC;QACnB,MAAM,OAAO,GAAG,OAAO,IAAI,2BAA2B,CAAC;QACvD,IAAI,CAAC,UAAU,GAAG,eAAK,CAAC,MAAM,CAAC;YAC7B,OAAO;YACP,OAAO,EAAE;gBACP,WAAW,EAAE,MAAM;gBACnB,cAAc,EAAE,kBAAkB;gBAClC,mBAAmB,EAAE,YAAY;aAClC;YACD,OAAO,EAAE,MAAM;SAChB,CAAC,CAAC;IACL,CAAC;IAED,KAAK,CAAC,IAAI,CAAC,QAA6C,EAAE,OAGzD;QAKC,MAAM,aAAa,GAAG,QAAQ,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,KAAK,QAAQ,CAAC,CAAC;QAC9D,MAAM,YAAY,GAAG,QAAQ,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,I
AAI,KAAK,QAAQ,CAAC,CAAC;QAE/D,MAAM,WAAW,GAAQ;YACvB,KAAK,EAAE,IAAI,CAAC,KAAK;YACjB,QAAQ,EAAE,YAAY,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC;gBAC/B,IAAI,EAAE,CAAC,CAAC,IAAI,KAAK,WAAW,CAAC,CAAC,CAAC,WAAW,CAAC,CAAC,CAAC,MAAM;gBACnD,OAAO,EAAE,CAAC,CAAC,OAAO;aACnB,CAAC,CAAC;YACH,WAAW,EAAE,OAAO,EAAE,WAAW,IAAI,CAAC;YACtC,UAAU,EAAE,OAAO,EAAE,SAAS,IAAI,IAAI;SACvC,CAAC;QAEF,IAAI,aAAa,EAAE,CAAC;YAClB,WAAW,CAAC,MAAM,GAAG,aAAa,CAAC,OAAO,CAAC;QAC7C,CAAC;QAED,MAAM,QAAQ,GAAG,MAAM,IAAI,CAAC,UAAU,CAAC,IAAI,CAAC,cAAc,EAAE,WAAW,CAAC,CAAC;QAEzE,OAAO;YACL,OAAO,EAAE,QAAQ,CAAC,IAAI,CAAC,OAAO,EAAE,CAAC,CAAC,CAAC,EAAE,IAAI,IAAI,EAAE;YAC/C,YAAY,EAAE,QAAQ,CAAC,IAAI,CAAC,WAAW,IAAI,EAAE;YAC7C,KAAK,EAAE,QAAQ,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC;gBAC3B,YAAY,EAAE,QAAQ,CAAC,IAAI,CAAC,KAAK,CAAC,YAAY,IAAI,CAAC;gBACnD,gBAAgB,EAAE,QAAQ,CAAC,IAAI,CAAC,KAAK,CAAC,aAAa,IAAI,CAAC;gBACxD,WAAW,EAAE,CAAC,QAAQ,CAAC,IAAI,CAAC,KAAK,CAAC,YAAY,IAAI,CAAC,CAAC,GAAG,CAAC,QAAQ,CAAC,IAAI,CAAC,KAAK,CAAC,aAAa,IAAI,CAAC,CAAC;aAChG,CAAC,CAAC,CAAC,SAAS;SACd,CAAC;IACJ,CAAC;CACF;AA5DD,0CA4DC;AAED,SAAgB,eAAe,CAC7B,QAAsB,EACtB,MAAc,EACd,OAAe,EACf,KAAa;IAEb,IAAI,QAAQ,KAAK,WAAW,EAAE,CAAC;QAC7B,OAAO,IAAI,eAAe,CAAC,MAAM,EAAE,OAAO,EAAE,KAAK,CAAC,CAAC;IACrD,CAAC;IACD,OAAO,IAAI,sBAAsB,CAAC,MAAM,EAAE,OAAO,EAAE,KAAK,CAAC,CAAC;AAC5D,CAAC"}
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
import { LLMClient } from './client.js';
|
|
2
|
+
import { Benchmark } from './benchmarks.js';
|
|
3
|
+
/** Reply text, finish reason and optional token usage for one evaluated item. */
export interface EvaluationResponse {
    content: string;
    finishReason: string;
    usage?: { promptTokens: number; completionTokens: number; totalTokens: number };
}
|
|
12
|
+
/**
 * Summary record for one benchmark/model pairing.
 * NOTE(review): field meanings are inferred from their names; the code that
 * fills this record is not visible here.
 */
export interface EvaluationResult {
    benchmark: string;
    model: string;
    /** Presumably the number of items evaluated — confirm against the producer. */
    total: number;
    /** Presumably the number of items judged correct — confirm against the producer. */
    correct: number;
    accuracy: number;
    timestamp: string;
    seed?: number;
    judge?: string;
}
|
|
22
|
+
/**
 * Tuning knobs accepted by the Evaluator constructor, which applies the
 * defaults 120 / 3 / 0 / 0 (in field order) for omitted values.
 */
interface EvaluatorOptions {
    /** NOTE(review): unit is not visible in this file — confirm. */
    timeout: number;
    /** Upper bound on chat attempts per item. */
    retries: number;
    /** Sampling temperature for the evaluated model. */
    temperature: number;
    /** Sampling temperature for the judge model. */
    judgeTemperature?: number;
}
|
|
28
|
+
/**
 * Declaration of the benchmark evaluator: it obtains replies from an
 * `LLMClient` and scores them, optionally through a second judge client.
 */
export declare class Evaluator {
    private client;
    private judgeClient;
    private model;
    private options;
    constructor(
        client: LLMClient,
        options?: Partial<EvaluatorOptions>,
        judgeClient?: LLMClient,
    );
    private buildPrompt;
    private stripThinkingTags;
    /** Resolves with the model reply for one benchmark item. */
    evaluate(benchmark: Benchmark, item: any): Promise<EvaluationResponse>;
    /** Scores a reply for one benchmark item. */
    checkAnswer(benchmark: Benchmark, item: any, response: EvaluationResponse): boolean;
    /** Scores a reply and may return the judge's raw text alongside the verdict. */
    evaluateAndCheckWithJudge(
        benchmark: Benchmark,
        item: any,
        modelResponse: EvaluationResponse,
    ): Promise<{ correct: boolean; judgeResponse?: string }>;
    private checkMathAnswer;
    private extractFinalAnswer;
    private extractNumber;
    private checkCodeAnswer;
    private checkTypeScriptAnswer;
    private checkSqlAnswer;
    private extractCode;
    private checkMultipleChoice;
    private checkTruthfulQA;
    private checkBashAnswer;
    private checkBBHAnswer;
    private getMaxTokens;
    private sleep;
}
|
|
56
|
+
export {};
|
|
57
|
+
//# sourceMappingURL=evaluator.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"evaluator.d.ts","sourceRoot":"","sources":["../src/evaluator.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,SAAS,EAAE,MAAM,aAAa,CAAC;AACxC,OAAO,EAAE,SAAS,EAAE,MAAM,iBAAiB,CAAC;AAG5C,MAAM,WAAW,kBAAkB;IACjC,OAAO,EAAE,MAAM,CAAC;IAChB,YAAY,EAAE,MAAM,CAAC;IACrB,KAAK,CAAC,EAAE;QACN,YAAY,EAAE,MAAM,CAAC;QACrB,gBAAgB,EAAE,MAAM,CAAC;QACzB,WAAW,EAAE,MAAM,CAAC;KACrB,CAAC;CACH;AAED,MAAM,WAAW,gBAAgB;IAC/B,SAAS,EAAE,MAAM,CAAC;IAClB,KAAK,EAAE,MAAM,CAAC;IACd,KAAK,EAAE,MAAM,CAAC;IACd,OAAO,EAAE,MAAM,CAAC;IAChB,QAAQ,EAAE,MAAM,CAAC;IACjB,SAAS,EAAE,MAAM,CAAC;IAClB,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,KAAK,CAAC,EAAE,MAAM,CAAC;CAChB;AAED,UAAU,gBAAgB;IACxB,OAAO,EAAE,MAAM,CAAC;IAChB,OAAO,EAAE,MAAM,CAAC;IAChB,WAAW,EAAE,MAAM,CAAC;IACpB,gBAAgB,CAAC,EAAE,MAAM,CAAC;CAC3B;AAED,qBAAa,SAAS;IACpB,OAAO,CAAC,MAAM,CAAY;IAC1B,OAAO,CAAC,WAAW,CAAmB;IACtC,OAAO,CAAC,KAAK,CAAS;IACtB,OAAO,CAAC,OAAO,CAAmB;gBAEtB,MAAM,EAAE,SAAS,EAAE,OAAO,GAAE,OAAO,CAAC,gBAAgB,CAAM,EAAE,WAAW,CAAC,EAAE,SAAS;IAY/F,OAAO,CAAC,WAAW;IAsBnB,OAAO,CAAC,iBAAiB;IAOnB,QAAQ,CAAC,SAAS,EAAE,SAAS,EAAE,IAAI,EAAE,GAAG,GAAG,OAAO,CAAC,kBAAkB,CAAC;IA+B5E,WAAW,CAAC,SAAS,EAAE,SAAS,EAAE,IAAI,EAAE,GAAG,EAAE,QAAQ,EAAE,kBAAkB,GAAG,OAAO;IAgC7E,yBAAyB,CAC7B,SAAS,EAAE,SAAS,EACpB,IAAI,EAAE,GAAG,EACT,aAAa,EAAE,kBAAkB,GAChC,OAAO,CAAC;QAAE,OAAO,EAAE,OAAO,CAAC;QAAC,aAAa,CAAC,EAAE,MAAM,CAAA;KAAE,CAAC;IAsFxD,OAAO,CAAC,eAAe;IAcvB,OAAO,CAAC,kBAAkB;IAW1B,OAAO,CAAC,aAAa;IAKrB,OAAO,CAAC,eAAe;IAgBvB,OAAO,CAAC,qBAAqB;IAW7B,OAAO,CAAC,cAAc;IAiBtB,OAAO,CAAC,WAAW;IA2BnB,OAAO,CAAC,mBAAmB;IA2D3B,OAAO,CAAC,eAAe;IAuBvB,OAAO,CAAC,eAAe;IAmBvB,OAAO,CAAC,cAAc;IAkEtB,OAAO,CAAC,YAAY;IAIpB,OAAO,CAAC,KAAK;CAGd"}
|
|
@@ -0,0 +1,410 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.Evaluator = void 0;
|
|
4
|
+
const child_process_1 = require("child_process");
|
|
5
|
+
class Evaluator {
|
|
6
|
+
client;
|
|
7
|
+
judgeClient;
|
|
8
|
+
model;
|
|
9
|
+
options;
|
|
10
|
+
constructor(client, options = {}, judgeClient) {
|
|
11
|
+
this.client = client;
|
|
12
|
+
this.judgeClient = judgeClient || null;
|
|
13
|
+
this.model = client.model;
|
|
14
|
+
this.options = {
|
|
15
|
+
timeout: options.timeout ?? 120,
|
|
16
|
+
retries: options.retries ?? 3,
|
|
17
|
+
temperature: options.temperature ?? 0,
|
|
18
|
+
judgeTemperature: options.judgeTemperature ?? 0,
|
|
19
|
+
};
|
|
20
|
+
}
|
|
21
|
+
buildPrompt(benchmark, item) {
|
|
22
|
+
const template = benchmark.promptTemplate || '{question}';
|
|
23
|
+
// Replace common placeholders
|
|
24
|
+
let prompt = template
|
|
25
|
+
.replace('{question}', item.question || item.problem || item.prompt || '')
|
|
26
|
+
.replace('{text}', item.text || item.question || item.problem || item.prompt || '')
|
|
27
|
+
.replace('{prompt}', item.prompt || item.question || item.problem || '');
|
|
28
|
+
if (item.choices) {
|
|
29
|
+
const choices = Array.isArray(item.choices)
|
|
30
|
+
? item.choices
|
|
31
|
+
: Object.values(item.choices);
|
|
32
|
+
const choicesText = choices
|
|
33
|
+
.map((c, i) => `${String.fromCharCode(65 + i)}. ${c}`)
|
|
34
|
+
.join('\n');
|
|
35
|
+
prompt = prompt.replace('{choices}', choicesText);
|
|
36
|
+
}
|
|
37
|
+
return prompt;
|
|
38
|
+
}
|
|
39
|
+
stripThinkingTags(content) {
|
|
40
|
+
return content
|
|
41
|
+
.replace(/<think>[\s\S]*?<\/think>/gi, '')
|
|
42
|
+
.replace(/<think>[\s\S]*?<\/thinking>/gi, '')
|
|
43
|
+
.trim();
|
|
44
|
+
}
|
|
45
|
+
async evaluate(benchmark, item) {
|
|
46
|
+
const prompt = this.buildPrompt(benchmark, item);
|
|
47
|
+
let lastError = null;
|
|
48
|
+
for (let attempt = 0; attempt < this.options.retries; attempt++) {
|
|
49
|
+
try {
|
|
50
|
+
const response = await this.client.chat([{ role: 'user', content: prompt }], {
|
|
51
|
+
temperature: this.options.temperature,
|
|
52
|
+
maxTokens: this.getMaxTokens(benchmark),
|
|
53
|
+
});
|
|
54
|
+
return {
|
|
55
|
+
content: response.content,
|
|
56
|
+
finishReason: response.finishReason,
|
|
57
|
+
usage: response.usage,
|
|
58
|
+
};
|
|
59
|
+
}
|
|
60
|
+
catch (error) {
|
|
61
|
+
lastError = error;
|
|
62
|
+
if (attempt < this.options.retries - 1) {
|
|
63
|
+
const delay = Math.pow(2, attempt) * 1000;
|
|
64
|
+
await this.sleep(delay);
|
|
65
|
+
}
|
|
66
|
+
}
|
|
67
|
+
}
|
|
68
|
+
throw lastError || new Error('Evaluation failed');
|
|
69
|
+
}
|
|
70
|
+
checkAnswer(benchmark, item, response) {
|
|
71
|
+
const answerField = benchmark.answerField || 'answer';
|
|
72
|
+
const correctAnswer = item[answerField];
|
|
73
|
+
if (correctAnswer === undefined || correctAnswer === null)
|
|
74
|
+
return false;
|
|
75
|
+
const responseText = response.content.trim();
|
|
76
|
+
switch (benchmark.type) {
|
|
77
|
+
case 'math_reasoning':
|
|
78
|
+
return this.checkMathAnswer(responseText, correctAnswer);
|
|
79
|
+
case 'code':
|
|
80
|
+
return this.checkCodeAnswer(responseText, correctAnswer);
|
|
81
|
+
case 'typescript':
|
|
82
|
+
return this.checkTypeScriptAnswer(responseText, correctAnswer);
|
|
83
|
+
case 'sql':
|
|
84
|
+
return this.checkSqlAnswer(responseText, correctAnswer);
|
|
85
|
+
case 'truthfulqa':
|
|
86
|
+
return this.checkTruthfulQA(responseText, correctAnswer, item);
|
|
87
|
+
case 'commonsense':
|
|
88
|
+
case 'reasoning':
|
|
89
|
+
case 'knowledge':
|
|
90
|
+
return this.checkMultipleChoice(responseText, correctAnswer, item);
|
|
91
|
+
case 'bbh':
|
|
92
|
+
return this.checkBBHAnswer(responseText, correctAnswer);
|
|
93
|
+
case 'terminal':
|
|
94
|
+
return this.checkBashAnswer(responseText, correctAnswer);
|
|
95
|
+
default:
|
|
96
|
+
return responseText.toLowerCase().includes(correctAnswer.toLowerCase());
|
|
97
|
+
}
|
|
98
|
+
}
|
|
99
|
+
async evaluateAndCheckWithJudge(benchmark, item, modelResponse) {
|
|
100
|
+
if (!benchmark.useJudge || !this.judgeClient) {
|
|
101
|
+
const correct = this.checkAnswer(benchmark, item, modelResponse);
|
|
102
|
+
return { correct, judgeResponse: undefined };
|
|
103
|
+
}
|
|
104
|
+
let judgePrompt = benchmark.judgePromptTemplate || '';
|
|
105
|
+
const cleanResponse = this.stripThinkingTags(modelResponse.content);
|
|
106
|
+
// Build prompt based on benchmark type
|
|
107
|
+
switch (benchmark.id) {
|
|
108
|
+
case 'truthfulqa': {
|
|
109
|
+
const labeledChoices = item.labeled_choices || [];
|
|
110
|
+
judgePrompt = judgePrompt
|
|
111
|
+
.replace('{question}', item.question || '')
|
|
112
|
+
.replace('{all_choices}', labeledChoices.join('\n'))
|
|
113
|
+
.replace('{model_response}', cleanResponse);
|
|
114
|
+
break;
|
|
115
|
+
}
|
|
116
|
+
case 'humaneval':
|
|
117
|
+
case 'mbpp': {
|
|
118
|
+
const question = item.question || item.text || item.prompt || '';
|
|
119
|
+
const testCases = item.test_cases || item.test_list || [];
|
|
120
|
+
const testStr = Array.isArray(testCases)
|
|
121
|
+
? testCases.map((tc, i) => {
|
|
122
|
+
const input = tc.split('===')[0]?.trim() || '';
|
|
123
|
+
const expected = tc.split('===')[1]?.trim() || '';
|
|
124
|
+
return `Test ${i + 1}: Input: ${input} Expected: ${expected}`;
|
|
125
|
+
}).join('\n')
|
|
126
|
+
: JSON.stringify(testCases);
|
|
127
|
+
judgePrompt = judgePrompt
|
|
128
|
+
.replace('{question}', question)
|
|
129
|
+
.replace('{test_cases}', testStr)
|
|
130
|
+
.replace('{model_response}', cleanResponse);
|
|
131
|
+
break;
|
|
132
|
+
}
|
|
133
|
+
case 'nl2bash': {
|
|
134
|
+
const answerField = benchmark.answerField || 'cmd';
|
|
135
|
+
judgePrompt = judgePrompt
|
|
136
|
+
.replace('{question}', item.question || '')
|
|
137
|
+
.replace('{correct_answer}', item[answerField] || '')
|
|
138
|
+
.replace('{model_response}', cleanResponse);
|
|
139
|
+
break;
|
|
140
|
+
}
|
|
141
|
+
case 'spider': {
|
|
142
|
+
const answerField = benchmark.answerField || 'query';
|
|
143
|
+
judgePrompt = judgePrompt
|
|
144
|
+
.replace('{question}', item.question || '')
|
|
145
|
+
.replace('{correct_answer}', item[answerField] || '')
|
|
146
|
+
.replace('{model_response}', cleanResponse);
|
|
147
|
+
break;
|
|
148
|
+
}
|
|
149
|
+
case 'math': {
|
|
150
|
+
const answerField = benchmark.answerField || 'answer';
|
|
151
|
+
judgePrompt = judgePrompt
|
|
152
|
+
.replace('{question}', item.question || '')
|
|
153
|
+
.replace('{correct_answer}', item[answerField] || '')
|
|
154
|
+
.replace('{model_response}', cleanResponse);
|
|
155
|
+
break;
|
|
156
|
+
}
|
|
157
|
+
default: {
|
|
158
|
+
judgePrompt = judgePrompt
|
|
159
|
+
.replace('{question}', item.question || '')
|
|
160
|
+
.replace('{model_response}', cleanResponse);
|
|
161
|
+
}
|
|
162
|
+
}
|
|
163
|
+
try {
|
|
164
|
+
const judgeResponse = await this.judgeClient.chat([{ role: 'user', content: judgePrompt }], { temperature: this.options.judgeTemperature ?? 0, maxTokens: 2000 });
|
|
165
|
+
const judgeAnswer = judgeResponse.content.trim().toUpperCase();
|
|
166
|
+
const isCorrect = judgeAnswer.includes('YES');
|
|
167
|
+
return {
|
|
168
|
+
correct: isCorrect,
|
|
169
|
+
judgeResponse: judgeResponse.content
|
|
170
|
+
};
|
|
171
|
+
}
|
|
172
|
+
catch (error) {
|
|
173
|
+
console.error('Judge evaluation failed:', error);
|
|
174
|
+
return { correct: false, judgeResponse: `Error: ${error}` };
|
|
175
|
+
}
|
|
176
|
+
}
|
|
177
|
+
checkMathAnswer(response, correctAnswer) {
|
|
178
|
+
const extractedResponse = this.extractFinalAnswer(response);
|
|
179
|
+
const extractedAnswer = this.extractFinalAnswer(correctAnswer);
|
|
180
|
+
const responseNum = this.extractNumber(extractedResponse);
|
|
181
|
+
const answerNum = this.extractNumber(extractedAnswer);
|
|
182
|
+
if (responseNum !== null && answerNum !== null) {
|
|
183
|
+
return Math.abs(responseNum - answerNum) < 0.01;
|
|
184
|
+
}
|
|
185
|
+
return extractedResponse.toLowerCase() === extractedAnswer.toLowerCase();
|
|
186
|
+
}
|
|
187
|
+
extractFinalAnswer(text) {
|
|
188
|
+
if (text.includes('####')) {
|
|
189
|
+
return text.split('####').pop()?.trim() || text;
|
|
190
|
+
}
|
|
191
|
+
const numbers = text.match(/-?\d+\.?\d*/g);
|
|
192
|
+
if (numbers && numbers.length > 0) {
|
|
193
|
+
return numbers[numbers.length - 1];
|
|
194
|
+
}
|
|
195
|
+
return text.trim();
|
|
196
|
+
}
|
|
197
|
+
extractNumber(text) {
|
|
198
|
+
const match = text.match(/-?\d+\.?\d*/);
|
|
199
|
+
return match ? parseFloat(match[0]) : null;
|
|
200
|
+
}
|
|
201
|
+
checkCodeAnswer(response, correctAnswer) {
|
|
202
|
+
const responseCode = this.extractCode(response);
|
|
203
|
+
if (!responseCode || responseCode.length === 0) {
|
|
204
|
+
return false;
|
|
205
|
+
}
|
|
206
|
+
try {
|
|
207
|
+
const encoded = Buffer.from(responseCode).toString('base64');
|
|
208
|
+
(0, child_process_1.execSync)(`python3 -c "import base64; code=base64.b64decode('${encoded}').decode(); compile(code,'<string>','exec')"`, { stdio: 'pipe' });
|
|
209
|
+
return true;
|
|
210
|
+
}
|
|
211
|
+
catch {
|
|
212
|
+
return false;
|
|
213
|
+
}
|
|
214
|
+
}
|
|
215
|
+
checkTypeScriptAnswer(response, correctAnswer) {
|
|
216
|
+
// For TypeScript, check if response contains TypeScript-like code
|
|
217
|
+
// (function declaration with types)
|
|
218
|
+
const hasFunction = /function\s+\w+/.test(response);
|
|
219
|
+
const hasArrowFunction = /=>\s*{/.test(response);
|
|
220
|
+
const hasTypeScriptTypes = /:\s*(number|string|boolean|any|void|never)/.test(response);
|
|
221
|
+
const hasExport = /export\s+/.test(response);
|
|
222
|
+
return hasFunction || hasArrowFunction;
|
|
223
|
+
}
|
|
224
|
+
checkSqlAnswer(response, correctAnswer) {
|
|
225
|
+
// For SQL, check if response contains SQL keywords
|
|
226
|
+
const hasSelect = /SELECT/i.test(response);
|
|
227
|
+
const hasFrom = /FROM/i.test(response);
|
|
228
|
+
// Extract SQL query from response if it's in a code block
|
|
229
|
+
const sqlMatch = response.match(/```sql\n?([\s\S]*?)```/i) ||
|
|
230
|
+
response.match(/```\n?([\s\S]*?)```/i);
|
|
231
|
+
if (sqlMatch) {
|
|
232
|
+
const extractedSql = sqlMatch[1].trim().toUpperCase();
|
|
233
|
+
return /SELECT/.test(extractedSql) && /FROM/.test(extractedSql);
|
|
234
|
+
}
|
|
235
|
+
return hasSelect && hasFrom;
|
|
236
|
+
}
|
|
237
|
+
extractCode(text) {
|
|
238
|
+
// Try to find code blocks first
|
|
239
|
+
const codeBlockMatch = text.match(/```(?:\w+)?\n([\s\S]*?)```/);
|
|
240
|
+
if (codeBlockMatch)
|
|
241
|
+
return codeBlockMatch[1].trim();
|
|
242
|
+
// Try to find Python function definitions and extract from there
|
|
243
|
+
const funcMatch = text.match(/(?:^|\n)(def \w+.*?(?=\n(?:[^\s]|$)))/);
|
|
244
|
+
if (funcMatch) {
|
|
245
|
+
return text.slice(text.indexOf(funcMatch[1])).trim();
|
|
246
|
+
}
|
|
247
|
+
// If no code block or function found, try to find a line that looks like Python code
|
|
248
|
+
const lines = text.split('\n');
|
|
249
|
+
const codeStartIdx = lines.findIndex(line => line.trim().startsWith('def ') ||
|
|
250
|
+
line.trim().startsWith('class ') ||
|
|
251
|
+
line.trim().startsWith('import ') ||
|
|
252
|
+
line.trim().startsWith('from '));
|
|
253
|
+
if (codeStartIdx >= 0) {
|
|
254
|
+
return lines.slice(codeStartIdx).join('\n').trim();
|
|
255
|
+
}
|
|
256
|
+
return text.trim();
|
|
257
|
+
}
|
|
258
|
+
checkMultipleChoice(response, correctAnswer, item) {
|
|
259
|
+
const normalizedResponse = response.toUpperCase().trim();
|
|
260
|
+
// Handle numeric answer (e.g., MMLU returns 0, 1, 2, 3 as numbers)
|
|
261
|
+
let answerStr = String(correctAnswer);
|
|
262
|
+
let answerIndex;
|
|
263
|
+
if (/^[0-3]$/.test(answerStr)) {
|
|
264
|
+
// Numeric index (0-3)
|
|
265
|
+
answerIndex = parseInt(answerStr);
|
|
266
|
+
answerStr = String.fromCharCode(65 + answerIndex); // Convert to A, B, C, D
|
|
267
|
+
}
|
|
268
|
+
else {
|
|
269
|
+
// Letter answer (A, B, C, D)
|
|
270
|
+
answerStr = answerStr.toUpperCase().trim();
|
|
271
|
+
answerIndex = answerStr.charCodeAt(0) - 65;
|
|
272
|
+
}
|
|
273
|
+
const normalizedAnswer = answerStr;
|
|
274
|
+
// Direct match
|
|
275
|
+
if (normalizedResponse === normalizedAnswer)
|
|
276
|
+
return true;
|
|
277
|
+
// Single letter response (e.g., "A")
|
|
278
|
+
if (normalizedResponse.length === 1 && /^[A-D]$/.test(normalizedResponse)) {
|
|
279
|
+
return normalizedResponse === normalizedAnswer;
|
|
280
|
+
}
|
|
281
|
+
// Extract letter from various patterns
|
|
282
|
+
// "Choice A", "choice A", "Option B", "option B"
|
|
283
|
+
let letterMatch = normalizedResponse.match(/\b(CHOICE|OPTION)\s+([A-D])\b/);
|
|
284
|
+
if (letterMatch)
|
|
285
|
+
return letterMatch[2] === normalizedAnswer;
|
|
286
|
+
// "Answer is A", "answer is A", "is A"
|
|
287
|
+
letterMatch = normalizedResponse.match(/\b(ANSWER|IS)\s+([A-D])\b/);
|
|
288
|
+
if (letterMatch)
|
|
289
|
+
return letterMatch[2] === normalizedAnswer;
|
|
290
|
+
// "The correct answer is B"
|
|
291
|
+
letterMatch = normalizedResponse.match(/\bCORRECT\b.*?\b([A-D])\b/);
|
|
292
|
+
if (letterMatch)
|
|
293
|
+
return letterMatch[1] === normalizedAnswer;
|
|
294
|
+
// Single letter A, B, C, or D surrounded by word boundaries
|
|
295
|
+
letterMatch = normalizedResponse.match(/\b([A-D])\b/);
|
|
296
|
+
if (letterMatch)
|
|
297
|
+
return letterMatch[1] === normalizedAnswer;
|
|
298
|
+
const choices = Array.isArray(item.choices)
|
|
299
|
+
? item.choices
|
|
300
|
+
: item.choices
|
|
301
|
+
? Object.values(item.choices)
|
|
302
|
+
: [];
|
|
303
|
+
if (answerIndex >= 0 && answerIndex < choices.length) {
|
|
304
|
+
const expectedChoice = String(choices[answerIndex]).toUpperCase().trim();
|
|
305
|
+
// Check if response contains the expected choice or vice versa
|
|
306
|
+
return normalizedResponse.includes(expectedChoice) || expectedChoice.includes(normalizedResponse);
|
|
307
|
+
}
|
|
308
|
+
return false;
|
|
309
|
+
}
|
|
310
|
+
checkTruthfulQA(response, correctAnswer, item) {
|
|
311
|
+
const normalizedResponse = response.toUpperCase().trim();
|
|
312
|
+
// Get correct choice text
|
|
313
|
+
const answerIndex = typeof correctAnswer === 'number' ? correctAnswer : parseInt(correctAnswer);
|
|
314
|
+
const choices = Array.isArray(item.choices) ? item.choices : [];
|
|
315
|
+
if (answerIndex < 0 || answerIndex >= choices.length)
|
|
316
|
+
return false;
|
|
317
|
+
const correctChoice = choices[answerIndex].toUpperCase().trim();
|
|
318
|
+
// Check if response contains the correct answer or vice versa
|
|
319
|
+
if (normalizedResponse.includes(correctChoice) || correctChoice.includes(normalizedResponse)) {
|
|
320
|
+
return true;
|
|
321
|
+
}
|
|
322
|
+
// Check if it's the same first 20+ characters
|
|
323
|
+
if (normalizedResponse.slice(0, 20) === correctChoice.slice(0, 20)) {
|
|
324
|
+
return true;
|
|
325
|
+
}
|
|
326
|
+
return false;
|
|
327
|
+
}
|
|
328
|
+
checkBashAnswer(response, correctAnswer) {
|
|
329
|
+
const normalizedResponse = response.toLowerCase().trim();
|
|
330
|
+
const normalizedAnswer = correctAnswer.toLowerCase().trim();
|
|
331
|
+
// Exact match
|
|
332
|
+
if (normalizedResponse === normalizedAnswer)
|
|
333
|
+
return true;
|
|
334
|
+
// Check if main command is present (partial credit)
|
|
335
|
+
const answerParts = normalizedAnswer.split(/\s+/);
|
|
336
|
+
if (answerParts.length > 0) {
|
|
337
|
+
const mainCmd = answerParts[0];
|
|
338
|
+
if (mainCmd && normalizedResponse.includes(mainCmd)) {
|
|
339
|
+
return true; // Partial credit - main command matches
|
|
340
|
+
}
|
|
341
|
+
}
|
|
342
|
+
return false;
|
|
343
|
+
}
|
|
344
|
+
checkBBHAnswer(response, correctAnswer) {
|
|
345
|
+
// Strip thinking tags
|
|
346
|
+
const strippedResponse = response
|
|
347
|
+
.replace(/<think>[\s\S]*?<\/think>/gi, '')
|
|
348
|
+
.replace(/<think>[\s\S]*?<\/thinking>/gi, '')
|
|
349
|
+
.trim();
|
|
350
|
+
// Extract final answer after #### if present
|
|
351
|
+
let finalAnswer = strippedResponse;
|
|
352
|
+
if (strippedResponse.includes('####')) {
|
|
353
|
+
finalAnswer = strippedResponse.split('####').pop()?.trim() || strippedResponse;
|
|
354
|
+
}
|
|
355
|
+
const normalizedResponse = finalAnswer.toLowerCase().trim();
|
|
356
|
+
const normalizedAnswer = correctAnswer.toLowerCase().trim();
|
|
357
|
+
// Exact match
|
|
358
|
+
if (normalizedResponse === normalizedAnswer)
|
|
359
|
+
return true;
|
|
360
|
+
// Handle True/False style answers
|
|
361
|
+
if (normalizedAnswer === 'true' || normalizedAnswer === 'false') {
|
|
362
|
+
if (normalizedResponse === 'true' || normalizedResponse === 'false') {
|
|
363
|
+
return normalizedResponse === normalizedAnswer;
|
|
364
|
+
}
|
|
365
|
+
}
|
|
366
|
+
// Handle Yes/No style answers
|
|
367
|
+
if (normalizedAnswer === 'yes' || normalizedAnswer === 'no') {
|
|
368
|
+
if (normalizedResponse === 'yes' || normalizedResponse === 'no') {
|
|
369
|
+
return normalizedResponse === normalizedAnswer;
|
|
370
|
+
}
|
|
371
|
+
}
|
|
372
|
+
// Handle A/B/C/D style answers (multiple choice)
|
|
373
|
+
if (/^[a-d]$/i.test(normalizedAnswer)) {
|
|
374
|
+
const answerChar = normalizedAnswer.toUpperCase();
|
|
375
|
+
if (normalizedResponse === answerChar)
|
|
376
|
+
return true;
|
|
377
|
+
// Check if response contains the letter
|
|
378
|
+
if (new RegExp(`\\b${answerChar}\\b`).test(normalizedResponse))
|
|
379
|
+
return true;
|
|
380
|
+
}
|
|
381
|
+
// Handle numeric answers
|
|
382
|
+
const responseNum = normalizedResponse.match(/-?\d+\.?\d*/);
|
|
383
|
+
const answerNum = normalizedAnswer.match(/-?\d+\.?\d*/);
|
|
384
|
+
if (responseNum && answerNum) {
|
|
385
|
+
return parseFloat(responseNum[0]) === parseFloat(answerNum[0]);
|
|
386
|
+
}
|
|
387
|
+
// Handle comma-separated lists (like word sorting)
|
|
388
|
+
if (normalizedAnswer.includes(',') && normalizedResponse.includes(',')) {
|
|
389
|
+
const respSet = new Set(normalizedResponse.split(',').map(s => s.trim()));
|
|
390
|
+
const ansSet = new Set(normalizedAnswer.split(',').map(s => s.trim()));
|
|
391
|
+
// Check if sets are equal
|
|
392
|
+
if (respSet.size === ansSet.size && [...respSet].every(x => ansSet.has(x))) {
|
|
393
|
+
return true;
|
|
394
|
+
}
|
|
395
|
+
}
|
|
396
|
+
// Partial match - check if the answer appears in the response
|
|
397
|
+
if (normalizedResponse.includes(normalizedAnswer) || normalizedAnswer.includes(normalizedResponse)) {
|
|
398
|
+
return true;
|
|
399
|
+
}
|
|
400
|
+
return false;
|
|
401
|
+
}
|
|
402
|
+
getMaxTokens(benchmark) {
|
|
403
|
+
return 100000;
|
|
404
|
+
}
|
|
405
|
+
sleep(ms) {
|
|
406
|
+
return new Promise((resolve) => setTimeout(resolve, ms));
|
|
407
|
+
}
|
|
408
|
+
}
|
|
409
|
+
// Publish `Evaluator` on this module's `exports` object under the same name.
exports.Evaluator = Evaluator;
|
|
410
|
+
//# sourceMappingURL=evaluator.js.map
|