llmtester 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,69 @@
1
/**
 * Provider tag carried by `LLMClient.provider` and accepted by `createLLMClient`.
 */
export type ProviderType =
    | 'openai'
    | 'anthropic'
    | 'custom';
2
/**
 * Contract implemented by the chat clients declared in this module.
 */
export interface LLMClient {
    /** Provider tag of the client. */
    provider: ProviderType;
    /** Base URL the client was constructed with. */
    baseUrl: string;
    /** Model name the client was constructed with. */
    model: string;
    /**
     * Sends a list of role/content messages and resolves with the reply.
     *
     * @param messages Conversation turns, each with a `role` and a `content` string.
     * @param options Optional sampling temperature and completion token limit.
     * @returns The reply text, the finish reason, and token usage when available.
     */
    chat(
        messages: Array<{ role: string; content: string }>,
        options?: { temperature?: number; maxTokens?: number },
    ): Promise<{
        content: string;
        finishReason: string;
        usage?: { promptTokens: number; completionTokens: number; totalTokens: number };
    }>;
}
22
/**
 * Declaration of the `LLMClient` implementation named `OpenAICompatibleClient`.
 */
export declare class OpenAICompatibleClient implements LLMClient {
    provider: ProviderType;
    baseUrl: string;
    model: string;
    private apiKey;
    private httpClient;
    /**
     * @param apiKey API key for the endpoint.
     * @param baseUrl Base URL of the endpoint.
     * @param model Model name.
     */
    constructor(apiKey: string, baseUrl: string, model: string);
    /**
     * Sends the messages and resolves with the reply text, finish reason and optional usage.
     */
    chat(
        messages: Array<{ role: string; content: string }>,
        options?: { temperature?: number; maxTokens?: number },
    ): Promise<{
        content: string;
        finishReason: string;
        usage?: { promptTokens: number; completionTokens: number; totalTokens: number };
    }>;
}
45
/**
 * Declaration of the `LLMClient` implementation named `AnthropicClient`.
 */
export declare class AnthropicClient implements LLMClient {
    provider: ProviderType;
    baseUrl: string;
    model: string;
    private apiKey;
    private httpClient;
    /**
     * @param apiKey API key for the endpoint.
     * @param baseUrl Base URL of the endpoint.
     * @param model Model name.
     */
    constructor(apiKey: string, baseUrl: string, model: string);
    /**
     * Sends the messages and resolves with the reply text, finish reason and optional usage.
     */
    chat(
        messages: Array<{ role: string; content: string }>,
        options?: { temperature?: number; maxTokens?: number },
    ): Promise<{
        content: string;
        finishReason: string;
        usage?: { promptTokens: number; completionTokens: number; totalTokens: number };
    }>;
}
68
/**
 * Factory that returns an `LLMClient` for the given provider tag.
 *
 * @param provider Provider tag.
 * @param apiKey API key handed to the client.
 * @param baseUrl Base URL handed to the client.
 * @param model Model name handed to the client.
 */
export declare function createLLMClient(
    provider: ProviderType,
    apiKey: string,
    baseUrl: string,
    model: string,
): LLMClient;
69
+ //# sourceMappingURL=client.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"client.d.ts","sourceRoot":"","sources":["../src/client.ts"],"names":[],"mappings":"AAEA,MAAM,MAAM,YAAY,GAAG,QAAQ,GAAG,WAAW,GAAG,QAAQ,CAAC;AAE7D,MAAM,WAAW,SAAS;IACxB,QAAQ,EAAE,YAAY,CAAC;IACvB,OAAO,EAAE,MAAM,CAAC;IAChB,KAAK,EAAE,MAAM,CAAC;IACd,IAAI,CAAC,QAAQ,EAAE;QAAE,IAAI,EAAE,MAAM,CAAC;QAAC,OAAO,EAAE,MAAM,CAAA;KAAE,EAAE,EAAE,OAAO,CAAC,EAAE;QAC5D,WAAW,CAAC,EAAE,MAAM,CAAC;QACrB,SAAS,CAAC,EAAE,MAAM,CAAC;KACpB,GAAG,OAAO,CAAC;QACV,OAAO,EAAE,MAAM,CAAC;QAChB,YAAY,EAAE,MAAM,CAAC;QACrB,KAAK,CAAC,EAAE;YAAE,YAAY,EAAE,MAAM,CAAC;YAAC,gBAAgB,EAAE,MAAM,CAAC;YAAC,WAAW,EAAE,MAAM,CAAA;SAAE,CAAC;KACjF,CAAC,CAAC;CACJ;AAED,qBAAa,sBAAuB,YAAW,SAAS;IAC/C,QAAQ,EAAE,YAAY,CAAY;IAClC,OAAO,EAAE,MAAM,CAAC;IAChB,KAAK,EAAE,MAAM,CAAC;IACrB,OAAO,CAAC,MAAM,CAAS;IACvB,OAAO,CAAC,UAAU,CAAgB;gBAEtB,MAAM,EAAE,MAAM,EAAE,OAAO,EAAE,MAAM,EAAE,KAAK,EAAE,MAAM;IAcpD,IAAI,CAAC,QAAQ,EAAE;QAAE,IAAI,EAAE,MAAM,CAAC;QAAC,OAAO,EAAE,MAAM,CAAA;KAAE,EAAE,EAAE,OAAO,CAAC,EAAE;QAClE,WAAW,CAAC,EAAE,MAAM,CAAC;QACrB,SAAS,CAAC,EAAE,MAAM,CAAC;KACpB,GAAG,OAAO,CAAC;QACV,OAAO,EAAE,MAAM,CAAC;QAChB,YAAY,EAAE,MAAM,CAAC;QACrB,KAAK,CAAC,EAAE;YAAE,YAAY,EAAE,MAAM,CAAC;YAAC,gBAAgB,EAAE,MAAM,CAAC;YAAC,WAAW,EAAE,MAAM,CAAA;SAAE,CAAC;KACjF,CAAC;CAmBH;AAED,qBAAa,eAAgB,YAAW,SAAS;IACxC,QAAQ,EAAE,YAAY,CAAe;IACrC,OAAO,EAAE,MAAM,CAAC;IAChB,KAAK,EAAE,MAAM,CAAC;IACrB,OAAO,CAAC,MAAM,CAAS;IACvB,OAAO,CAAC,UAAU,CAAgB;gBAEtB,MAAM,EAAE,MAAM,EAAE,OAAO,EAAE,MAAM,EAAE,KAAK,EAAE,MAAM;IAgBpD,IAAI,CAAC,QAAQ,EAAE;QAAE,IAAI,EAAE,MAAM,CAAC;QAAC,OAAO,EAAE,MAAM,CAAA;KAAE,EAAE,EAAE,OAAO,CAAC,EAAE;QAClE,WAAW,CAAC,EAAE,MAAM,CAAC;QACrB,SAAS,CAAC,EAAE,MAAM,CAAC;KACpB,GAAG,OAAO,CAAC;QACV,OAAO,EAAE,MAAM,CAAC;QAChB,YAAY,EAAE,MAAM,CAAC;QACrB,KAAK,CAAC,EAAE;YAAE,YAAY,EAAE,MAAM,CAAC;YAAC,gBAAgB,EAAE,MAAM,CAAC;YAAC,WAAW,EAAE,MAAM,CAAA;SAAE,CAAC;KACjF,CAAC;CA8BH;AAED,wBAAgB,eAAe,CAC7B,QAAQ,EAAE,YAAY,EACtB,MAAM,EAAE,MAAM,EACd,OAAO,EAAE,MAAM,EACf,KAAK,EAAE,MAAM,GACZ,SAAS,CAKX"}
package/dist/client.js ADDED
@@ -0,0 +1,103 @@
1
+ "use strict";
2
+ var __importDefault = (this && this.__importDefault) || function (mod) {
3
+ return (mod && mod.__esModule) ? mod : { "default": mod };
4
+ };
5
+ Object.defineProperty(exports, "__esModule", { value: true });
6
+ exports.AnthropicClient = exports.OpenAICompatibleClient = void 0;
7
+ exports.createLLMClient = createLLMClient;
8
+ const axios_1 = __importDefault(require("axios"));
9
+ class OpenAICompatibleClient {
10
+ provider = 'openai';
11
+ baseUrl;
12
+ model;
13
+ apiKey;
14
+ httpClient;
15
+ constructor(apiKey, baseUrl, model) {
16
+ this.apiKey = apiKey;
17
+ this.baseUrl = baseUrl;
18
+ this.model = model;
19
+ this.httpClient = axios_1.default.create({
20
+ baseURL: baseUrl,
21
+ headers: {
22
+ 'Authorization': `Bearer ${apiKey}`,
23
+ 'Content-Type': 'application/json',
24
+ },
25
+ timeout: 120000,
26
+ });
27
+ }
28
+ async chat(messages, options) {
29
+ const response = await this.httpClient.post('/chat/completions', {
30
+ model: this.model,
31
+ messages,
32
+ temperature: options?.temperature ?? 0,
33
+ max_tokens: options?.maxTokens ?? 512,
34
+ });
35
+ const choice = response.data.choices?.[0];
36
+ return {
37
+ content: choice?.message?.content || '',
38
+ finishReason: choice?.finish_reason || '',
39
+ usage: response.data.usage ? {
40
+ promptTokens: response.data.usage.prompt_tokens || 0,
41
+ completionTokens: response.data.usage.completion_tokens || 0,
42
+ totalTokens: response.data.usage.total_tokens || 0,
43
+ } : undefined,
44
+ };
45
+ }
46
+ }
47
+ exports.OpenAICompatibleClient = OpenAICompatibleClient;
48
+ class AnthropicClient {
49
+ provider = 'anthropic';
50
+ baseUrl;
51
+ model;
52
+ apiKey;
53
+ httpClient;
54
+ constructor(apiKey, baseUrl, model) {
55
+ this.apiKey = apiKey;
56
+ this.baseUrl = baseUrl;
57
+ this.model = model;
58
+ const baseURL = baseUrl || 'https://api.anthropic.com';
59
+ this.httpClient = axios_1.default.create({
60
+ baseURL,
61
+ headers: {
62
+ 'x-api-key': apiKey,
63
+ 'Content-Type': 'application/json',
64
+ 'anthropic-version': '2023-06-01',
65
+ },
66
+ timeout: 120000,
67
+ });
68
+ }
69
+ async chat(messages, options) {
70
+ const systemMessage = messages.find(m => m.role === 'system');
71
+ const chatMessages = messages.filter(m => m.role !== 'system');
72
+ const requestBody = {
73
+ model: this.model,
74
+ messages: chatMessages.map(m => ({
75
+ role: m.role === 'assistant' ? 'assistant' : 'user',
76
+ content: m.content,
77
+ })),
78
+ temperature: options?.temperature ?? 0,
79
+ max_tokens: options?.maxTokens ?? 1024,
80
+ };
81
+ if (systemMessage) {
82
+ requestBody.system = systemMessage.content;
83
+ }
84
+ const response = await this.httpClient.post('/v1/messages', requestBody);
85
+ return {
86
+ content: response.data.content?.[0]?.text || '',
87
+ finishReason: response.data.stop_reason || '',
88
+ usage: response.data.usage ? {
89
+ promptTokens: response.data.usage.input_tokens || 0,
90
+ completionTokens: response.data.usage.output_tokens || 0,
91
+ totalTokens: (response.data.usage.input_tokens || 0) + (response.data.usage.output_tokens || 0),
92
+ } : undefined,
93
+ };
94
+ }
95
+ }
96
+ exports.AnthropicClient = AnthropicClient;
97
/**
 * Builds the client for a provider tag: `'anthropic'` yields an `AnthropicClient`,
 * every other value yields an `OpenAICompatibleClient`.
 *
 * @param provider Provider tag.
 * @param apiKey API key passed to the client constructor.
 * @param baseUrl Base URL passed to the client constructor.
 * @param model Model name passed to the client constructor.
 * @returns The newly constructed client.
 */
function createLLMClient(provider, apiKey, baseUrl, model) {
    const ClientClass = provider === 'anthropic'
        ? AnthropicClient
        : OpenAICompatibleClient;
    return new ClientClass(apiKey, baseUrl, model);
}
103
+ //# sourceMappingURL=client.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"client.js","sourceRoot":"","sources":["../src/client.ts"],"names":[],"mappings":";;;;;;AAiIA,0CAUC;AA3ID,kDAA6C;AAkB7C,MAAa,sBAAsB;IAC1B,QAAQ,GAAiB,QAAQ,CAAC;IAClC,OAAO,CAAS;IAChB,KAAK,CAAS;IACb,MAAM,CAAS;IACf,UAAU,CAAgB;IAElC,YAAY,MAAc,EAAE,OAAe,EAAE,KAAa;QACxD,IAAI,CAAC,MAAM,GAAG,MAAM,CAAC;QACrB,IAAI,CAAC,OAAO,GAAG,OAAO,CAAC;QACvB,IAAI,CAAC,KAAK,GAAG,KAAK,CAAC;QACnB,IAAI,CAAC,UAAU,GAAG,eAAK,CAAC,MAAM,CAAC;YAC7B,OAAO,EAAE,OAAO;YAChB,OAAO,EAAE;gBACP,eAAe,EAAE,UAAU,MAAM,EAAE;gBACnC,cAAc,EAAE,kBAAkB;aACnC;YACD,OAAO,EAAE,MAAM;SAChB,CAAC,CAAC;IACL,CAAC;IAED,KAAK,CAAC,IAAI,CAAC,QAA6C,EAAE,OAGzD;QAKC,MAAM,QAAQ,GAAG,MAAM,IAAI,CAAC,UAAU,CAAC,IAAI,CAAC,mBAAmB,EAAE;YAC/D,KAAK,EAAE,IAAI,CAAC,KAAK;YACjB,QAAQ;YACR,WAAW,EAAE,OAAO,EAAE,WAAW,IAAI,CAAC;YACtC,UAAU,EAAE,OAAO,EAAE,SAAS,IAAI,GAAG;SACtC,CAAC,CAAC;QAEH,MAAM,MAAM,GAAG,QAAQ,CAAC,IAAI,CAAC,OAAO,EAAE,CAAC,CAAC,CAAC,CAAC;QAC1C,OAAO;YACL,OAAO,EAAE,MAAM,EAAE,OAAO,EAAE,OAAO,IAAI,EAAE;YACvC,YAAY,EAAE,MAAM,EAAE,aAAa,IAAI,EAAE;YACzC,KAAK,EAAE,QAAQ,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC;gBAC3B,YAAY,EAAE,QAAQ,CAAC,IAAI,CAAC,KAAK,CAAC,aAAa,IAAI,CAAC;gBACpD,gBAAgB,EAAE,QAAQ,CAAC,IAAI,CAAC,KAAK,CAAC,iBAAiB,IAAI,CAAC;gBAC5D,WAAW,EAAE,QAAQ,CAAC,IAAI,CAAC,KAAK,CAAC,YAAY,IAAI,CAAC;aACnD,CAAC,CAAC,CAAC,SAAS;SACd,CAAC;IACJ,CAAC;CACF;AA/CD,wDA+CC;AAED,MAAa,eAAe;IACnB,QAAQ,GAAiB,WAAW,CAAC;IACrC,OAAO,CAAS;IAChB,KAAK,CAAS;IACb,MAAM,CAAS;IACf,UAAU,CAAgB;IAElC,YAAY,MAAc,EAAE,OAAe,EAAE,KAAa;QACxD,IAAI,CAAC,MAAM,GAAG,MAAM,CAAC;QACrB,IAAI,CAAC,OAAO,GAAG,OAAO,CAAC;QACvB,IAAI,CAAC,KAAK,GAAG,KAAK,CAAC;QACnB,MAAM,OAAO,GAAG,OAAO,IAAI,2BAA2B,CAAC;QACvD,IAAI,CAAC,UAAU,GAAG,eAAK,CAAC,MAAM,CAAC;YAC7B,OAAO;YACP,OAAO,EAAE;gBACP,WAAW,EAAE,MAAM;gBACnB,cAAc,EAAE,kBAAkB;gBAClC,mBAAmB,EAAE,YAAY;aAClC;YACD,OAAO,EAAE,MAAM;SAChB,CAAC,CAAC;IACL,CAAC;IAED,KAAK,CAAC,IAAI,CAAC,QAA6C,EAAE,OAGzD;QAKC,MAAM,aAAa,GAAG,QAAQ,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,KAAK,QAAQ,CAAC,CAAC;QAC9D,MAAM,YAAY,GAAG,QAAQ,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC
,IAAI,KAAK,QAAQ,CAAC,CAAC;QAE/D,MAAM,WAAW,GAAQ;YACvB,KAAK,EAAE,IAAI,CAAC,KAAK;YACjB,QAAQ,EAAE,YAAY,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC;gBAC/B,IAAI,EAAE,CAAC,CAAC,IAAI,KAAK,WAAW,CAAC,CAAC,CAAC,WAAW,CAAC,CAAC,CAAC,MAAM;gBACnD,OAAO,EAAE,CAAC,CAAC,OAAO;aACnB,CAAC,CAAC;YACH,WAAW,EAAE,OAAO,EAAE,WAAW,IAAI,CAAC;YACtC,UAAU,EAAE,OAAO,EAAE,SAAS,IAAI,IAAI;SACvC,CAAC;QAEF,IAAI,aAAa,EAAE,CAAC;YAClB,WAAW,CAAC,MAAM,GAAG,aAAa,CAAC,OAAO,CAAC;QAC7C,CAAC;QAED,MAAM,QAAQ,GAAG,MAAM,IAAI,CAAC,UAAU,CAAC,IAAI,CAAC,cAAc,EAAE,WAAW,CAAC,CAAC;QAEzE,OAAO;YACL,OAAO,EAAE,QAAQ,CAAC,IAAI,CAAC,OAAO,EAAE,CAAC,CAAC,CAAC,EAAE,IAAI,IAAI,EAAE;YAC/C,YAAY,EAAE,QAAQ,CAAC,IAAI,CAAC,WAAW,IAAI,EAAE;YAC7C,KAAK,EAAE,QAAQ,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC;gBAC3B,YAAY,EAAE,QAAQ,CAAC,IAAI,CAAC,KAAK,CAAC,YAAY,IAAI,CAAC;gBACnD,gBAAgB,EAAE,QAAQ,CAAC,IAAI,CAAC,KAAK,CAAC,aAAa,IAAI,CAAC;gBACxD,WAAW,EAAE,CAAC,QAAQ,CAAC,IAAI,CAAC,KAAK,CAAC,YAAY,IAAI,CAAC,CAAC,GAAG,CAAC,QAAQ,CAAC,IAAI,CAAC,KAAK,CAAC,aAAa,IAAI,CAAC,CAAC;aAChG,CAAC,CAAC,CAAC,SAAS;SACd,CAAC;IACJ,CAAC;CACF;AA5DD,0CA4DC;AAED,SAAgB,eAAe,CAC7B,QAAsB,EACtB,MAAc,EACd,OAAe,EACf,KAAa;IAEb,IAAI,QAAQ,KAAK,WAAW,EAAE,CAAC;QAC7B,OAAO,IAAI,eAAe,CAAC,MAAM,EAAE,OAAO,EAAE,KAAK,CAAC,CAAC;IACrD,CAAC;IACD,OAAO,IAAI,sBAAsB,CAAC,MAAM,EAAE,OAAO,EAAE,KAAK,CAAC,CAAC;AAC5D,CAAC"}
@@ -0,0 +1,57 @@
1
+ import { LLMClient } from './client.js';
2
+ import { Benchmark } from './benchmarks.js';
3
/**
 * Model reply as returned by `Evaluator.evaluate` and consumed by the answer checks.
 */
export interface EvaluationResponse {
    /** Reply text. */
    content: string;
    /** Finish reason reported for the reply. */
    finishReason: string;
    /** Token counts, when available. */
    usage?: { promptTokens: number; completionTokens: number; totalTokens: number };
}
12
/**
 * Summary record for one benchmark run.
 * NOTE(review): field meanings below are read off the names; confirm against the producer.
 */
export interface EvaluationResult {
    /** Benchmark identifier or name. */
    benchmark: string;
    /** Model that was evaluated. */
    model: string;
    /** Presumably the number of items evaluated. */
    total: number;
    /** Presumably the number of items graded correct. */
    correct: number;
    /** Presumably derived from `correct` and `total`; scale not visible here. */
    accuracy: number;
    /** Time of the run as a string; format not visible here. */
    timestamp: string;
    /** Optional seed. */
    seed?: number;
    /** Optional judge identifier. */
    judge?: string;
}
22
/**
 * Settings accepted (partially) by the `Evaluator` constructor.
 */
interface EvaluatorOptions {
    /** Timeout setting; unit not stated in this declaration. */
    timeout: number;
    /** Number of attempts per item. */
    retries: number;
    /** Sampling temperature for the evaluated model. */
    temperature: number;
    /** Sampling temperature for the judge model, when one is used. */
    judgeTemperature?: number;
}
28
/**
 * Declaration of the benchmark evaluator: it queries a client for an answer and
 * grades that answer, optionally through a judge client.
 */
export declare class Evaluator {
    private client;
    private judgeClient;
    private model;
    private options;
    /**
     * @param client Client used to answer benchmark items.
     * @param options Optional overrides of the evaluator settings.
     * @param judgeClient Optional client used for judge-based grading.
     */
    constructor(
        client: LLMClient,
        options?: Partial<EvaluatorOptions>,
        judgeClient?: LLMClient,
    );
    private buildPrompt;
    private stripThinkingTags;
    /** Asks the client to answer one benchmark item. */
    evaluate(benchmark: Benchmark, item: any): Promise<EvaluationResponse>;
    /** Grades a response against the item without a judge. */
    checkAnswer(
        benchmark: Benchmark,
        item: any,
        response: EvaluationResponse,
    ): boolean;
    /** Grades a response, resolving with the verdict and the judge's raw reply if any. */
    evaluateAndCheckWithJudge(
        benchmark: Benchmark,
        item: any,
        modelResponse: EvaluationResponse,
    ): Promise<{ correct: boolean; judgeResponse?: string }>;
    private checkMathAnswer;
    private extractFinalAnswer;
    private extractNumber;
    private checkCodeAnswer;
    private checkTypeScriptAnswer;
    private checkSqlAnswer;
    private extractCode;
    private checkMultipleChoice;
    private checkTruthfulQA;
    private checkBashAnswer;
    private checkBBHAnswer;
    private getMaxTokens;
    private sleep;
}
56
+ export {};
57
+ //# sourceMappingURL=evaluator.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"evaluator.d.ts","sourceRoot":"","sources":["../src/evaluator.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,SAAS,EAAE,MAAM,aAAa,CAAC;AACxC,OAAO,EAAE,SAAS,EAAE,MAAM,iBAAiB,CAAC;AAG5C,MAAM,WAAW,kBAAkB;IACjC,OAAO,EAAE,MAAM,CAAC;IAChB,YAAY,EAAE,MAAM,CAAC;IACrB,KAAK,CAAC,EAAE;QACN,YAAY,EAAE,MAAM,CAAC;QACrB,gBAAgB,EAAE,MAAM,CAAC;QACzB,WAAW,EAAE,MAAM,CAAC;KACrB,CAAC;CACH;AAED,MAAM,WAAW,gBAAgB;IAC/B,SAAS,EAAE,MAAM,CAAC;IAClB,KAAK,EAAE,MAAM,CAAC;IACd,KAAK,EAAE,MAAM,CAAC;IACd,OAAO,EAAE,MAAM,CAAC;IAChB,QAAQ,EAAE,MAAM,CAAC;IACjB,SAAS,EAAE,MAAM,CAAC;IAClB,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,KAAK,CAAC,EAAE,MAAM,CAAC;CAChB;AAED,UAAU,gBAAgB;IACxB,OAAO,EAAE,MAAM,CAAC;IAChB,OAAO,EAAE,MAAM,CAAC;IAChB,WAAW,EAAE,MAAM,CAAC;IACpB,gBAAgB,CAAC,EAAE,MAAM,CAAC;CAC3B;AAED,qBAAa,SAAS;IACpB,OAAO,CAAC,MAAM,CAAY;IAC1B,OAAO,CAAC,WAAW,CAAmB;IACtC,OAAO,CAAC,KAAK,CAAS;IACtB,OAAO,CAAC,OAAO,CAAmB;gBAEtB,MAAM,EAAE,SAAS,EAAE,OAAO,GAAE,OAAO,CAAC,gBAAgB,CAAM,EAAE,WAAW,CAAC,EAAE,SAAS;IAY/F,OAAO,CAAC,WAAW;IAsBnB,OAAO,CAAC,iBAAiB;IAOnB,QAAQ,CAAC,SAAS,EAAE,SAAS,EAAE,IAAI,EAAE,GAAG,GAAG,OAAO,CAAC,kBAAkB,CAAC;IA+B5E,WAAW,CAAC,SAAS,EAAE,SAAS,EAAE,IAAI,EAAE,GAAG,EAAE,QAAQ,EAAE,kBAAkB,GAAG,OAAO;IAgC7E,yBAAyB,CAC7B,SAAS,EAAE,SAAS,EACpB,IAAI,EAAE,GAAG,EACT,aAAa,EAAE,kBAAkB,GAChC,OAAO,CAAC;QAAE,OAAO,EAAE,OAAO,CAAC;QAAC,aAAa,CAAC,EAAE,MAAM,CAAA;KAAE,CAAC;IAsFxD,OAAO,CAAC,eAAe;IAcvB,OAAO,CAAC,kBAAkB;IAW1B,OAAO,CAAC,aAAa;IAKrB,OAAO,CAAC,eAAe;IAgBvB,OAAO,CAAC,qBAAqB;IAW7B,OAAO,CAAC,cAAc;IAiBtB,OAAO,CAAC,WAAW;IA2BnB,OAAO,CAAC,mBAAmB;IA2D3B,OAAO,CAAC,eAAe;IAuBvB,OAAO,CAAC,eAAe;IAmBvB,OAAO,CAAC,cAAc;IAkEtB,OAAO,CAAC,YAAY;IAIpB,OAAO,CAAC,KAAK;CAGd"}
@@ -0,0 +1,410 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ exports.Evaluator = void 0;
4
+ const child_process_1 = require("child_process");
5
+ class Evaluator {
6
+ client;
7
+ judgeClient;
8
+ model;
9
+ options;
10
+ constructor(client, options = {}, judgeClient) {
11
+ this.client = client;
12
+ this.judgeClient = judgeClient || null;
13
+ this.model = client.model;
14
+ this.options = {
15
+ timeout: options.timeout ?? 120,
16
+ retries: options.retries ?? 3,
17
+ temperature: options.temperature ?? 0,
18
+ judgeTemperature: options.judgeTemperature ?? 0,
19
+ };
20
+ }
21
+ buildPrompt(benchmark, item) {
22
+ const template = benchmark.promptTemplate || '{question}';
23
+ // Replace common placeholders
24
+ let prompt = template
25
+ .replace('{question}', item.question || item.problem || item.prompt || '')
26
+ .replace('{text}', item.text || item.question || item.problem || item.prompt || '')
27
+ .replace('{prompt}', item.prompt || item.question || item.problem || '');
28
+ if (item.choices) {
29
+ const choices = Array.isArray(item.choices)
30
+ ? item.choices
31
+ : Object.values(item.choices);
32
+ const choicesText = choices
33
+ .map((c, i) => `${String.fromCharCode(65 + i)}. ${c}`)
34
+ .join('\n');
35
+ prompt = prompt.replace('{choices}', choicesText);
36
+ }
37
+ return prompt;
38
+ }
39
+ stripThinkingTags(content) {
40
+ return content
41
+ .replace(/<think>[\s\S]*?<\/think>/gi, '')
42
+ .replace(/<think>[\s\S]*?<\/thinking>/gi, '')
43
+ .trim();
44
+ }
45
+ async evaluate(benchmark, item) {
46
+ const prompt = this.buildPrompt(benchmark, item);
47
+ let lastError = null;
48
+ for (let attempt = 0; attempt < this.options.retries; attempt++) {
49
+ try {
50
+ const response = await this.client.chat([{ role: 'user', content: prompt }], {
51
+ temperature: this.options.temperature,
52
+ maxTokens: this.getMaxTokens(benchmark),
53
+ });
54
+ return {
55
+ content: response.content,
56
+ finishReason: response.finishReason,
57
+ usage: response.usage,
58
+ };
59
+ }
60
+ catch (error) {
61
+ lastError = error;
62
+ if (attempt < this.options.retries - 1) {
63
+ const delay = Math.pow(2, attempt) * 1000;
64
+ await this.sleep(delay);
65
+ }
66
+ }
67
+ }
68
+ throw lastError || new Error('Evaluation failed');
69
+ }
70
+ checkAnswer(benchmark, item, response) {
71
+ const answerField = benchmark.answerField || 'answer';
72
+ const correctAnswer = item[answerField];
73
+ if (correctAnswer === undefined || correctAnswer === null)
74
+ return false;
75
+ const responseText = response.content.trim();
76
+ switch (benchmark.type) {
77
+ case 'math_reasoning':
78
+ return this.checkMathAnswer(responseText, correctAnswer);
79
+ case 'code':
80
+ return this.checkCodeAnswer(responseText, correctAnswer);
81
+ case 'typescript':
82
+ return this.checkTypeScriptAnswer(responseText, correctAnswer);
83
+ case 'sql':
84
+ return this.checkSqlAnswer(responseText, correctAnswer);
85
+ case 'truthfulqa':
86
+ return this.checkTruthfulQA(responseText, correctAnswer, item);
87
+ case 'commonsense':
88
+ case 'reasoning':
89
+ case 'knowledge':
90
+ return this.checkMultipleChoice(responseText, correctAnswer, item);
91
+ case 'bbh':
92
+ return this.checkBBHAnswer(responseText, correctAnswer);
93
+ case 'terminal':
94
+ return this.checkBashAnswer(responseText, correctAnswer);
95
+ default:
96
+ return responseText.toLowerCase().includes(correctAnswer.toLowerCase());
97
+ }
98
+ }
99
+ async evaluateAndCheckWithJudge(benchmark, item, modelResponse) {
100
+ if (!benchmark.useJudge || !this.judgeClient) {
101
+ const correct = this.checkAnswer(benchmark, item, modelResponse);
102
+ return { correct, judgeResponse: undefined };
103
+ }
104
+ let judgePrompt = benchmark.judgePromptTemplate || '';
105
+ const cleanResponse = this.stripThinkingTags(modelResponse.content);
106
+ // Build prompt based on benchmark type
107
+ switch (benchmark.id) {
108
+ case 'truthfulqa': {
109
+ const labeledChoices = item.labeled_choices || [];
110
+ judgePrompt = judgePrompt
111
+ .replace('{question}', item.question || '')
112
+ .replace('{all_choices}', labeledChoices.join('\n'))
113
+ .replace('{model_response}', cleanResponse);
114
+ break;
115
+ }
116
+ case 'humaneval':
117
+ case 'mbpp': {
118
+ const question = item.question || item.text || item.prompt || '';
119
+ const testCases = item.test_cases || item.test_list || [];
120
+ const testStr = Array.isArray(testCases)
121
+ ? testCases.map((tc, i) => {
122
+ const input = tc.split('===')[0]?.trim() || '';
123
+ const expected = tc.split('===')[1]?.trim() || '';
124
+ return `Test ${i + 1}: Input: ${input} Expected: ${expected}`;
125
+ }).join('\n')
126
+ : JSON.stringify(testCases);
127
+ judgePrompt = judgePrompt
128
+ .replace('{question}', question)
129
+ .replace('{test_cases}', testStr)
130
+ .replace('{model_response}', cleanResponse);
131
+ break;
132
+ }
133
+ case 'nl2bash': {
134
+ const answerField = benchmark.answerField || 'cmd';
135
+ judgePrompt = judgePrompt
136
+ .replace('{question}', item.question || '')
137
+ .replace('{correct_answer}', item[answerField] || '')
138
+ .replace('{model_response}', cleanResponse);
139
+ break;
140
+ }
141
+ case 'spider': {
142
+ const answerField = benchmark.answerField || 'query';
143
+ judgePrompt = judgePrompt
144
+ .replace('{question}', item.question || '')
145
+ .replace('{correct_answer}', item[answerField] || '')
146
+ .replace('{model_response}', cleanResponse);
147
+ break;
148
+ }
149
+ case 'math': {
150
+ const answerField = benchmark.answerField || 'answer';
151
+ judgePrompt = judgePrompt
152
+ .replace('{question}', item.question || '')
153
+ .replace('{correct_answer}', item[answerField] || '')
154
+ .replace('{model_response}', cleanResponse);
155
+ break;
156
+ }
157
+ default: {
158
+ judgePrompt = judgePrompt
159
+ .replace('{question}', item.question || '')
160
+ .replace('{model_response}', cleanResponse);
161
+ }
162
+ }
163
+ try {
164
+ const judgeResponse = await this.judgeClient.chat([{ role: 'user', content: judgePrompt }], { temperature: this.options.judgeTemperature ?? 0, maxTokens: 2000 });
165
+ const judgeAnswer = judgeResponse.content.trim().toUpperCase();
166
+ const isCorrect = judgeAnswer.includes('YES');
167
+ return {
168
+ correct: isCorrect,
169
+ judgeResponse: judgeResponse.content
170
+ };
171
+ }
172
+ catch (error) {
173
+ console.error('Judge evaluation failed:', error);
174
+ return { correct: false, judgeResponse: `Error: ${error}` };
175
+ }
176
+ }
177
+ checkMathAnswer(response, correctAnswer) {
178
+ const extractedResponse = this.extractFinalAnswer(response);
179
+ const extractedAnswer = this.extractFinalAnswer(correctAnswer);
180
+ const responseNum = this.extractNumber(extractedResponse);
181
+ const answerNum = this.extractNumber(extractedAnswer);
182
+ if (responseNum !== null && answerNum !== null) {
183
+ return Math.abs(responseNum - answerNum) < 0.01;
184
+ }
185
+ return extractedResponse.toLowerCase() === extractedAnswer.toLowerCase();
186
+ }
187
+ extractFinalAnswer(text) {
188
+ if (text.includes('####')) {
189
+ return text.split('####').pop()?.trim() || text;
190
+ }
191
+ const numbers = text.match(/-?\d+\.?\d*/g);
192
+ if (numbers && numbers.length > 0) {
193
+ return numbers[numbers.length - 1];
194
+ }
195
+ return text.trim();
196
+ }
197
+ extractNumber(text) {
198
+ const match = text.match(/-?\d+\.?\d*/);
199
+ return match ? parseFloat(match[0]) : null;
200
+ }
201
+ checkCodeAnswer(response, correctAnswer) {
202
+ const responseCode = this.extractCode(response);
203
+ if (!responseCode || responseCode.length === 0) {
204
+ return false;
205
+ }
206
+ try {
207
+ const encoded = Buffer.from(responseCode).toString('base64');
208
+ (0, child_process_1.execSync)(`python3 -c "import base64; code=base64.b64decode('${encoded}').decode(); compile(code,'<string>','exec')"`, { stdio: 'pipe' });
209
+ return true;
210
+ }
211
+ catch {
212
+ return false;
213
+ }
214
+ }
215
+ checkTypeScriptAnswer(response, correctAnswer) {
216
+ // For TypeScript, check if response contains TypeScript-like code
217
+ // (function declaration with types)
218
+ const hasFunction = /function\s+\w+/.test(response);
219
+ const hasArrowFunction = /=>\s*{/.test(response);
220
+ const hasTypeScriptTypes = /:\s*(number|string|boolean|any|void|never)/.test(response);
221
+ const hasExport = /export\s+/.test(response);
222
+ return hasFunction || hasArrowFunction;
223
+ }
224
+ checkSqlAnswer(response, correctAnswer) {
225
+ // For SQL, check if response contains SQL keywords
226
+ const hasSelect = /SELECT/i.test(response);
227
+ const hasFrom = /FROM/i.test(response);
228
+ // Extract SQL query from response if it's in a code block
229
+ const sqlMatch = response.match(/```sql\n?([\s\S]*?)```/i) ||
230
+ response.match(/```\n?([\s\S]*?)```/i);
231
+ if (sqlMatch) {
232
+ const extractedSql = sqlMatch[1].trim().toUpperCase();
233
+ return /SELECT/.test(extractedSql) && /FROM/.test(extractedSql);
234
+ }
235
+ return hasSelect && hasFrom;
236
+ }
237
+ extractCode(text) {
238
+ // Try to find code blocks first
239
+ const codeBlockMatch = text.match(/```(?:\w+)?\n([\s\S]*?)```/);
240
+ if (codeBlockMatch)
241
+ return codeBlockMatch[1].trim();
242
+ // Try to find Python function definitions and extract from there
243
+ const funcMatch = text.match(/(?:^|\n)(def \w+.*?(?=\n(?:[^\s]|$)))/);
244
+ if (funcMatch) {
245
+ return text.slice(text.indexOf(funcMatch[1])).trim();
246
+ }
247
+ // If no code block or function found, try to find a line that looks like Python code
248
+ const lines = text.split('\n');
249
+ const codeStartIdx = lines.findIndex(line => line.trim().startsWith('def ') ||
250
+ line.trim().startsWith('class ') ||
251
+ line.trim().startsWith('import ') ||
252
+ line.trim().startsWith('from '));
253
+ if (codeStartIdx >= 0) {
254
+ return lines.slice(codeStartIdx).join('\n').trim();
255
+ }
256
+ return text.trim();
257
+ }
258
+ checkMultipleChoice(response, correctAnswer, item) {
259
+ const normalizedResponse = response.toUpperCase().trim();
260
+ // Handle numeric answer (e.g., MMLU returns 0, 1, 2, 3 as numbers)
261
+ let answerStr = String(correctAnswer);
262
+ let answerIndex;
263
+ if (/^[0-3]$/.test(answerStr)) {
264
+ // Numeric index (0-3)
265
+ answerIndex = parseInt(answerStr);
266
+ answerStr = String.fromCharCode(65 + answerIndex); // Convert to A, B, C, D
267
+ }
268
+ else {
269
+ // Letter answer (A, B, C, D)
270
+ answerStr = answerStr.toUpperCase().trim();
271
+ answerIndex = answerStr.charCodeAt(0) - 65;
272
+ }
273
+ const normalizedAnswer = answerStr;
274
+ // Direct match
275
+ if (normalizedResponse === normalizedAnswer)
276
+ return true;
277
+ // Single letter response (e.g., "A")
278
+ if (normalizedResponse.length === 1 && /^[A-D]$/.test(normalizedResponse)) {
279
+ return normalizedResponse === normalizedAnswer;
280
+ }
281
+ // Extract letter from various patterns
282
+ // "Choice A", "choice A", "Option B", "option B"
283
+ let letterMatch = normalizedResponse.match(/\b(CHOICE|OPTION)\s+([A-D])\b/);
284
+ if (letterMatch)
285
+ return letterMatch[2] === normalizedAnswer;
286
+ // "Answer is A", "answer is A", "is A"
287
+ letterMatch = normalizedResponse.match(/\b(ANSWER|IS)\s+([A-D])\b/);
288
+ if (letterMatch)
289
+ return letterMatch[2] === normalizedAnswer;
290
+ // "The correct answer is B"
291
+ letterMatch = normalizedResponse.match(/\bCORRECT\b.*?\b([A-D])\b/);
292
+ if (letterMatch)
293
+ return letterMatch[1] === normalizedAnswer;
294
+ // Single letter A, B, C, or D surrounded by word boundaries
295
+ letterMatch = normalizedResponse.match(/\b([A-D])\b/);
296
+ if (letterMatch)
297
+ return letterMatch[1] === normalizedAnswer;
298
+ const choices = Array.isArray(item.choices)
299
+ ? item.choices
300
+ : item.choices
301
+ ? Object.values(item.choices)
302
+ : [];
303
+ if (answerIndex >= 0 && answerIndex < choices.length) {
304
+ const expectedChoice = String(choices[answerIndex]).toUpperCase().trim();
305
+ // Check if response contains the expected choice or vice versa
306
+ return normalizedResponse.includes(expectedChoice) || expectedChoice.includes(normalizedResponse);
307
+ }
308
+ return false;
309
+ }
310
+ checkTruthfulQA(response, correctAnswer, item) {
311
+ const normalizedResponse = response.toUpperCase().trim();
312
+ // Get correct choice text
313
+ const answerIndex = typeof correctAnswer === 'number' ? correctAnswer : parseInt(correctAnswer);
314
+ const choices = Array.isArray(item.choices) ? item.choices : [];
315
+ if (answerIndex < 0 || answerIndex >= choices.length)
316
+ return false;
317
+ const correctChoice = choices[answerIndex].toUpperCase().trim();
318
+ // Check if response contains the correct answer or vice versa
319
+ if (normalizedResponse.includes(correctChoice) || correctChoice.includes(normalizedResponse)) {
320
+ return true;
321
+ }
322
+ // Check if it's the same first 20+ characters
323
+ if (normalizedResponse.slice(0, 20) === correctChoice.slice(0, 20)) {
324
+ return true;
325
+ }
326
+ return false;
327
+ }
328
+ checkBashAnswer(response, correctAnswer) {
329
+ const normalizedResponse = response.toLowerCase().trim();
330
+ const normalizedAnswer = correctAnswer.toLowerCase().trim();
331
+ // Exact match
332
+ if (normalizedResponse === normalizedAnswer)
333
+ return true;
334
+ // Check if main command is present (partial credit)
335
+ const answerParts = normalizedAnswer.split(/\s+/);
336
+ if (answerParts.length > 0) {
337
+ const mainCmd = answerParts[0];
338
+ if (mainCmd && normalizedResponse.includes(mainCmd)) {
339
+ return true; // Partial credit - main command matches
340
+ }
341
+ }
342
+ return false;
343
+ }
344
+ checkBBHAnswer(response, correctAnswer) {
345
+ // Strip thinking tags
346
+ const strippedResponse = response
347
+ .replace(/<think>[\s\S]*?<\/think>/gi, '')
348
+ .replace(/<think>[\s\S]*?<\/thinking>/gi, '')
349
+ .trim();
350
+ // Extract final answer after #### if present
351
+ let finalAnswer = strippedResponse;
352
+ if (strippedResponse.includes('####')) {
353
+ finalAnswer = strippedResponse.split('####').pop()?.trim() || strippedResponse;
354
+ }
355
+ const normalizedResponse = finalAnswer.toLowerCase().trim();
356
+ const normalizedAnswer = correctAnswer.toLowerCase().trim();
357
+ // Exact match
358
+ if (normalizedResponse === normalizedAnswer)
359
+ return true;
360
+ // Handle True/False style answers
361
+ if (normalizedAnswer === 'true' || normalizedAnswer === 'false') {
362
+ if (normalizedResponse === 'true' || normalizedResponse === 'false') {
363
+ return normalizedResponse === normalizedAnswer;
364
+ }
365
+ }
366
+ // Handle Yes/No style answers
367
+ if (normalizedAnswer === 'yes' || normalizedAnswer === 'no') {
368
+ if (normalizedResponse === 'yes' || normalizedResponse === 'no') {
369
+ return normalizedResponse === normalizedAnswer;
370
+ }
371
+ }
372
+ // Handle A/B/C/D style answers (multiple choice)
373
+ if (/^[a-d]$/i.test(normalizedAnswer)) {
374
+ const answerChar = normalizedAnswer.toUpperCase();
375
+ if (normalizedResponse === answerChar)
376
+ return true;
377
+ // Check if response contains the letter
378
+ if (new RegExp(`\\b${answerChar}\\b`).test(normalizedResponse))
379
+ return true;
380
+ }
381
+ // Handle numeric answers
382
+ const responseNum = normalizedResponse.match(/-?\d+\.?\d*/);
383
+ const answerNum = normalizedAnswer.match(/-?\d+\.?\d*/);
384
+ if (responseNum && answerNum) {
385
+ return parseFloat(responseNum[0]) === parseFloat(answerNum[0]);
386
+ }
387
+ // Handle comma-separated lists (like word sorting)
388
+ if (normalizedAnswer.includes(',') && normalizedResponse.includes(',')) {
389
+ const respSet = new Set(normalizedResponse.split(',').map(s => s.trim()));
390
+ const ansSet = new Set(normalizedAnswer.split(',').map(s => s.trim()));
391
+ // Check if sets are equal
392
+ if (respSet.size === ansSet.size && [...respSet].every(x => ansSet.has(x))) {
393
+ return true;
394
+ }
395
+ }
396
+ // Partial match - check if the answer appears in the response
397
+ if (normalizedResponse.includes(normalizedAnswer) || normalizedAnswer.includes(normalizedResponse)) {
398
+ return true;
399
+ }
400
+ return false;
401
+ }
402
+ getMaxTokens(benchmark) {
403
+ return 100000;
404
+ }
405
+ sleep(ms) {
406
+ return new Promise((resolve) => setTimeout(resolve, ms));
407
+ }
408
+ }
409
+ exports.Evaluator = Evaluator;
410
+ //# sourceMappingURL=evaluator.js.map